3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Per-platform definitions of ALIGN / ATOMIC (alignment attributes), MEMORY_BARRIER,
// ATOMIC_COUNTER and the ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD operations.
// Apple: counters are volatile int32_t updated through the OSAtomic*32Barrier functions.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows: counters are volatile LONG updated through the Interlocked* functions.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
// Other GCC targets: counters are volatile int updated through the __sync builtins.
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) for alignment, Interlocked* functions for counters.
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment attribute, no barrier, and plain
// (non-atomic) int arithmetic for the counter operations.
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// Fallback for GCC releases older than 4.6: extract the lowest float lane of an
// SSE register through the compiler builtin. The version test must use
// __GNUC__ (the misspelled __GNUC evaluates to 0 in #if, which made the test
// always true) and must only look at the minor version when the major is 4.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
103 #define MM_FREE _mm_free
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texcoord sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// to size per-array tables (e.g. the triangle attribs table).
110 typedef enum DPSOFTRAST_ARRAY_e
112 DPSOFTRAST_ARRAY_POSITION,
113 DPSOFTRAST_ARRAY_COLOR,
114 DPSOFTRAST_ARRAY_TEXCOORD0,
115 DPSOFTRAST_ARRAY_TEXCOORD1,
116 DPSOFTRAST_ARRAY_TEXCOORD2,
117 DPSOFTRAST_ARRAY_TEXCOORD3,
118 DPSOFTRAST_ARRAY_TEXCOORD4,
119 DPSOFTRAST_ARRAY_TEXCOORD5,
120 DPSOFTRAST_ARRAY_TEXCOORD6,
121 DPSOFTRAST_ARRAY_TEXCOORD7,
122 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table (only part of the struct is visible here).
126 typedef struct DPSOFTRAST_Texture_s
133 DPSOFTRAST_TEXTURE_FILTER filter;
// counter declared with the platform's atomic counter type
136 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks the slot as unused
137 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size of the level,
// [2] width, [3] height, [4] depth
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// COMMAND_SIZE is the granularity that DPSOFTRAST_ALIGNCOMMAND rounds command
// sizes up to; COMMAND_ALIGN applies the ALIGN attribute to command structs.
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command stored in the command pool.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
// Opcode 0 is the Reset marker written by DPSOFTRAST_AllocateCommand when an
// allocation does not fit in the remaining space at the end of the pool.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the enum constant
// DPSOFTRAST_OPCODE_<name> = opcodeval and an aligned struct type
// DPSOFTRAST_Command_<name> that starts with the same opcode/commandsize header.
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands (only part of the struct is visible here).
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
// byte buffer of DPSOFTRAST_DRAW_MAXCOMMANDPOOL bytes that commands are carved from
170 ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data (only part of the struct is visible here).
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array, three 4-float vectors; as used by DPSOFTRAST_CALCATTRIB:
// [0] is multiplied by span x, [1] by span y, [2] is the constant term
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// SSE evaluation of one attribute at a span's start: loads attribs[arrayindex][0]
// into slope and sets data = attribs[..][2] + span->x * slope + span->y * attribs[..][1].
// slope and data must be __m128 lvalues; the attribs rows are read with aligned loads.
182 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
183 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
184 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
185 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
186 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar (float[4]) evaluation of one attribute at a span's start:
// copies attribs[arrayindex][0] into slope and sets, per component,
// data = attribs[..][2] + span->x * slope + span->y * attribs[..][1].
// triangle and span are pointer expressions; data and slope are float[4] lvalues.
// Every macro argument is parenthesized so that expressions such as
// "spans + i" can be passed for span.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels generated for a triangle.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc selects from the
// (sfactor, dfactor) pair and the blend-subtract flag.
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
223 typedef enum DPSOFTRAST_BLENDMODE_e
225 DPSOFTRAST_BLENDMODE_OPAQUE,
226 DPSOFTRAST_BLENDMODE_ALPHA,
227 DPSOFTRAST_BLENDMODE_ADDALPHA,
228 DPSOFTRAST_BLENDMODE_ADD,
229 DPSOFTRAST_BLENDMODE_INVMOD,
230 DPSOFTRAST_BLENDMODE_MUL,
231 DPSOFTRAST_BLENDMODE_MUL2,
232 DPSOFTRAST_BLENDMODE_SUBALPHA,
233 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
234 DPSOFTRAST_BLENDMODE_INVADD,
235 DPSOFTRAST_BLENDMODE_TOTAL
237 DPSOFTRAST_BLENDMODE;
// State owned by one rasterizer thread (only part of the struct is visible
// here): the thread's copy of the render state, values derived from it, and
// its span/triangle work buffers.
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
255 float polygonoffset[2];
// clip plane after DPSOFTRAST_RecalcClipPlane folded in the viewport transform
257 ALIGN(float fb_clipplane[4]);
260 int shader_permutation;
261 int shader_exactspecularmath;
// bound textures; pointers into dpsoftrast.texture (rebased by DPSOFTRAST_Texture_Grow)
263 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
265 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
268 // DPSOFTRAST_VALIDATE_ flags
271 // derived values (DPSOFTRAST_VALIDATE_FB)
274 ALIGN(float fb_viewportcenter[4]);
275 ALIGN(float fb_viewportscale[4]);
277 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
280 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's position in the command pool; compared against
// dpsoftrast.drawcommand in DPSOFTRAST_Draw_FreeCommandPool
289 ATOMIC(volatile int commandoffset);
// waiting is set by DPSOFTRAST_Draw_FreeCommandPool while it blocks on waitcond;
// starving is read there to decide whether to signal drawcond
291 volatile bool waiting;
292 volatile bool starving;
299 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
303 DPSOFTRAST_State_Thread);
// Global rasterizer state (only part of the struct is visible here):
// framebuffer pointers, vertex array pointers, the texture table, the worker
// threads and the command pool they consume.
305 typedef ALIGN(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers; rows are fb_width pixels apart
309 unsigned int *fb_depthpixels;
310 unsigned int *fb_colorpixels[4];
313 ALIGN(float fb_viewportcenter[4]);
314 ALIGN(float fb_viewportscale[4]);
317 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
320 const float *pointer_vertex3f;
321 const float *pointer_color4f;
322 const unsigned char *pointer_color4ub;
323 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
326 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// bound textures; pointers into the texture array below
328 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
332 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333 float *screencoord4f;
339 int shader_permutation;
340 int shader_exactspecularmath;
// lowest slot index at which DPSOFTRAST_Texture_New starts searching for a free slot
344 int texture_firstfree;
// texture table; reallocated by DPSOFTRAST_Texture_Grow
345 DPSOFTRAST_Texture *texture;
// set to a static message by functions that reject their arguments
350 const char *errorstring;
355 DPSOFTRAST_State_Thread *threads;
// command pool position published to the threads by DPSOFTRAST_Draw_SyncCommands
357 ATOMIC(volatile int drawcommand);
359 DPSOFTRAST_State_Command_Pool commandpool;
// the single global rasterizer instance used throughout this file
363 DPSOFTRAST_State dpsoftrast;
// Scale applied to (1 - depth) when converting to an integer depth value (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one 32-bit BGRA8 value:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7. Each channel
// is scaled by 255 and rounded by adding 0.5 before truncation.
// Arguments are parenthesized so expressions can be passed safely, and the
// shifts are done on unsigned values so that a full alpha channel
// (255 << 24) does not overflow a signed int; the result is still an int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth buffer representation
// DPSOFTRAST_DEPTHSCALE * (1 - d); d is parenthesized so expressions work.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
375 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377 fb_viewportcenter[3] = 0.5f;
378 fb_viewportcenter[0] = 0.0f;
379 fb_viewportscale[1] = 0.5f * viewport[2];
380 fb_viewportscale[2] = -0.5f * viewport[3];
381 fb_viewportscale[3] = 0.5f;
382 fb_viewportscale[0] = 1.0f;
// Computes the two framebuffer row bands (miny1..maxy1 and miny2..maxy2)
// assigned to this thread from its index, dpsoftrast.fb_height and
// dpsoftrast.numthreads.
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the height is split into 2*numthreads bands and the thread
// gets band [index] and band [numthreads+index]
387 if (dpsoftrast.interlace)
389 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// NOTE(review): presumably the non-interlaced branch (the else is not visible
// in this extract): both bands are the same single slice of numthreads slices
396 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Converts thread->clipplane into thread->fb_clipplane using the thread's
// viewport transform: each coefficient is divided by the matching viewport
// scale (clipplane 0,1,2,3 pair with viewport components 1,2,3,0), then the
// viewport center is folded into the constant term.
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
403 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
// the last term reads fb_clipplane[3] as assigned on the previous line, scaled
// by fb_viewportcenter[0] (which DPSOFTRAST_RecalcViewport sets to 0.0f)
407 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes the framebuffer-space scissor rectangle of a thread and then
// refreshes its viewport transform, clip plane and row bands.
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
412 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413 // and viewport projection values
// scissor = {x, y, width, height}; y is flipped against the framebuffer height
416 x1 = thread->scissor[0];
417 x2 = thread->scissor[0] + thread->scissor[2];
418 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with scissor testing off the rectangle is the whole framebuffer
420 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
422 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
424 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as {x, y, width, height}
425 thread->fb_scissor[0] = x1;
426 thread->fb_scissor[1] = y1;
427 thread->fb_scissor[2] = x2 - x1;
428 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport transform, so it is recalculated after it
430 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431 DPSOFTRAST_RecalcClipPlane(thread);
432 DPSOFTRAST_RecalcThread(thread);
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
437 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the thread's (sfactor, dfactor) blend function pair, together with the
// blend-subtract flag, to one of the DPSOFTRAST_BLENDMODE values in
// thread->fb_blendmode. Unlisted combinations select DPSOFTRAST_BLENDMODE_OPAQUE.
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only GL_SRC_ALPHA / GL_ONE is listed
442 if (thread->blendsubtract)
// the switch key packs sfactor into the bits above 16 and dfactor below
444 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label that assigns the mode and breaks
446 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// NOTE(review): presumably the non-subtract branch (the else is not visible in this extract)
454 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
456 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two different factor pairs both map to MUL
461 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is currently set in thread->validate; the expression always evaluates to 0.
// Both arguments are parenthesized so pointer expressions can be passed for thread.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state groups that are both requested in mask and
// marked dirty in thread->validate, clearing each dirty flag before the
// corresponding Recalc function runs.
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested flags that are actually dirty
475 mask &= thread->validate;
478 if (mask & DPSOFTRAST_VALIDATE_FB)
480 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481 DPSOFTRAST_RecalcFB(thread);
483 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
485 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486 DPSOFTRAST_RecalcDepthFunc(thread);
488 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
490 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. The slot is returned only when index is in
// [1, dpsoftrast.texture_end) and the slot has pixel storage (bytes != NULL).
// Callers in this file treat a null result as "no such texture".
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
497 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498 return &dpsoftrast.texture[index];
// Enlarges the texture table and rebases every bound-texture pointer (global
// and per-thread) from the old array onto the reallocated one.
502 static void DPSOFTRAST_Texture_Grow(void)
// remembered so the bound pointers can be converted to offsets after realloc
504 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505 DPSOFTRAST_State_Thread *thread;
509 // expand texture array as needed
510 if (dpsoftrast.texture_max < 1024)
511 dpsoftrast.texture_max = 1024;
513 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored without a NULL check, so on
// failure the old array is leaked and the rebasing below dereferences NULL;
// the rebasing also does arithmetic on pointers into the old (possibly freed)
// block - confirm whether this should compute indices before reallocating
514 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the globally bound textures
515 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516 if (dpsoftrast.texbound[i])
517 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase the textures bound in each thread's state
518 for (j = 0; j < dpsoftrast.numthreads; j++)
520 thread = &dpsoftrast.threads[j];
521 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522 if (thread->texbound[i])
523 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates flags and dimensions, claims the first free
// slot of the texture table (growing it when needed), fills in the mip level
// table and allocates zeroed pixel storage. On rejected arguments
// dpsoftrast.errorstring is set to a static message.
// NOTE(review): the return statements are not visible in this extract;
// presumably the slot index is returned on success - confirm.
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces per level
536 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538 DPSOFTRAST_Texture *texture;
// NOTE(review): the product of three ints can overflow, and two negative
// dimensions give a positive product that passes this check
539 if (width*height*depth < 1)
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
546 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific checks
551 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): this second assignment repeats the message above; the line
// that guards it is not visible here - confirm the message fits its case
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
568 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// depth != 1 means a 3D texture
573 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): identical to the check directly above
583 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
590 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero when x has more than one bit set
593 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
595 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598 // find first empty slot in texture array
599 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot just claimed
602 dpsoftrast.texture_firstfree = texnum + 1;
603 if (dpsoftrast.texture_max <= texnum)
604 DPSOFTRAST_Texture_Grow();
605 if (dpsoftrast.texture_end <= texnum)
606 dpsoftrast.texture_end = texnum + 1;
// the pointer is taken after the possible Grow, which may move the array
607 texture = &dpsoftrast.texture[texnum];
608 memset(texture, 0, sizeof(*texture));
609 texture->flags = flags;
610 texture->width = width;
611 texture->height = height;
612 texture->depth = depth;
613 texture->sides = sides;
// mip level table entry: 4 bytes per pixel; [0] is the running byte offset
// of this level, [1] its byte size, [2..4] its dimensions
625 s = w * h * d * sides * 4;
626 texture->mipmap[mipmaps][0] = size;
627 texture->mipmap[mipmaps][1] = s;
628 texture->mipmap[mipmaps][2] = w;
629 texture->mipmap[mipmaps][3] = h;
630 texture->mipmap[mipmaps][4] = d;
// condition tested once the level is 1x1x1 or mipmaps were not requested
633 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
639 texture->mipmaps = mipmaps;
640 texture->size = size;
642 // allocate the pixels now
643 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// free/used bookkeeping of the texture table. Does nothing for an index that
// does not name a live texture.
647 void DPSOFTRAST_Texture_Free(int index)
649 DPSOFTRAST_Texture *texture;
650 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654 MM_FREE(texture->bytes);
655 texture->bytes = NULL;
// a zeroed slot (bytes == NULL) is what marks it as free
656 memset(texture, 0, sizeof(*texture));
657 // adjust the free range and used range
658 if (dpsoftrast.texture_firstfree > index)
659 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing free slots
660 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture by averaging texels of the
// next larger level. Pixels are 4 bytes each; each output row is produced from
// up to two parent rows in up to two parent layers.
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
665 int i, x, y, z, w, layer0, layer1, row0, row1;
666 unsigned char *o, *i0, *i1, *i2, *i3;
667 DPSOFTRAST_Texture *texture;
668 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669 if (texture->mipmaps <= 1)
// level i is built from level i-1
671 for (i = 1;i < texture->mipmaps;i++)
673 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second parent layer to the last layer of the parent level
677 if (layer1 >= texture->mipmap[i-1][4])
678 layer1 = texture->mipmap[i-1][4]-1;
679 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second parent row to the last row of the parent level
683 if (row1 >= texture->mipmap[i-1][3])
684 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the parent rows (layer0/layer1 x row0/row1)
685 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
686 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690 w = texture->mipmap[i][2];
693 if (texture->mipmap[i-1][2] > 1)
695 // average 3D texture
// eight parent texels per output texel, rounded (+4, >>3)
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
698 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
706 // average 3D mipmap with parent width == 1
// four parent texels per output texel, rounded (+2, >>2); only i0 and i1 advance here
707 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
709 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
718 if (texture->mipmap[i-1][2] > 1)
720 // average 2D texture (common case)
// 2x2 parent texels per output texel, rounded (+2, >>2)
721 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
723 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
731 // 2D texture with parent width == 1
// two parent texels (one per row), rounded (+1, >>1)
732 o[0] = (i0[0] + i1[0] + 1) >> 1;
733 o[1] = (i0[1] + i1[1] + 1) >> 1;
734 o[2] = (i0[2] + i1[2] + 1) >> 2 * 0 + 1;
735 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels into a texture
// and then regenerates its mipmaps. Rows are stored in reverse vertical order:
// dst starts at the end of level 0 and moves back one row before each copy.
// NOTE(review): the visible lines always address mipmap[0]; the mip parameter
// is not referenced in them - confirm against the full function.
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
744 DPSOFTRAST_Texture *texture;
746 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// mipmap[0][1] is the byte size of level 0, so this points one row past the
// row blocky counted from the end, at column blockx
751 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
752 while (blockheight > 0)
754 dst -= texture->mipmap[0][2] * 4;
755 memcpy(dst, pixels, blockwidth * 4);
756 pixels += blockwidth * 4;
760 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces level 0 of a texture from pixels, one row of width*4 bytes at a
// time, and then regenerates its mipmaps. Does nothing for an invalid index.
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
764 DPSOFTRAST_Texture *texture;
765 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// stride: bytes per row of level 0; dst starts at the end of level 0
770 int i, stride = texture->mipmap[0][2]*4;
771 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
// one iteration per row of level 0
772 for (i = texture->mipmap[0][3];i > 0;i--)
775 memcpy(dst, pixels, stride);
779 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of mip level mip of a texture, or 0 for an invalid index.
// mip is used as an array index without a range check in the visible lines.
781 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
783 DPSOFTRAST_Texture *texture;
784 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785 return texture->mipmap[mip][2];
// Returns the height of mip level mip of a texture, or 0 for an invalid index.
// mip is used as an array index without a range check in the visible lines.
787 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
789 DPSOFTRAST_Texture *texture;
790 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791 return texture->mipmap[mip][3];
// Returns the depth of mip level mip of a texture, or 0 for an invalid index.
// mip is used as an array index without a range check in the visible lines.
793 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
795 DPSOFTRAST_Texture *texture;
796 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
797 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level mip inside the texture's
// pixel storage, or a null pointer for an invalid index.
// mip is used as an array index without a range check in the visible lines.
799 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
801 DPSOFTRAST_Texture *texture;
802 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
805 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected (errorstring is set) for
// textures created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
807 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
809 DPSOFTRAST_Texture *texture;
810 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
811 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
813 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
818 texture->filter = filter;
821 static void DPSOFTRAST_Draw_FlushThreads(void);
823 static void DPSOFTRAST_Draw_SyncCommands(void)
825 if(dpsoftrast.usethreads) MEMORY_BARRIER;
826 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes room for space bytes in the command pool by waiting for the worker
// thread that is furthest behind, then stores the recomputed usage in
// commandpool.usedcommands.
829 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
831 DPSOFTRAST_State_Thread *thread;
833 int freecommand = dpsoftrast.commandpool.freecommand;
834 int usedcommands = dpsoftrast.commandpool.usedcommands;
// check whether the requested space already fits
835 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// publish all queued commands so the threads can make progress
837 DPSOFTRAST_Draw_SyncCommands();
// find the thread with the largest backlog: distance from its commandoffset
// to freecommand, wrapped around the pool size
843 for (i = 0; i < dpsoftrast.numthreads; i++)
845 thread = &dpsoftrast.threads[i];
846 commandoffset = freecommand - thread->commandoffset;
847 if (commandoffset < 0)
848 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
849 if (commandoffset > usedcommands)
852 usedcommands = commandoffset;
// exit test: enough space is free now, or no thread has a backlog
855 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// otherwise block on the selected thread until it signals waitcond
857 thread = &dpsoftrast.threads[waitindex];
858 Thread_LockMutex(thread->drawmutex);
// only wait if the thread has not already caught up with drawcommand
859 if (thread->commandoffset != dpsoftrast.drawcommand)
861 thread->waiting = true;
// wake the thread first if it flagged itself as starving
862 if (thread->starving) Thread_CondSignal(thread->drawcond);
863 Thread_CondWait(thread->waitcond, thread->drawmutex);
864 thread->waiting = false;
866 Thread_UnlockMutex(thread->drawmutex);
868 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up by the amount needed to reach the next multiple of
// COMMAND_SIZE (the mask arithmetic assumes COMMAND_SIZE is a power of two).
// size is evaluated twice.
871 #define DPSOFTRAST_ALIGNCOMMAND(size) \
872 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> and the aligned struct size, returning a typed pointer.
873 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
874 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves size bytes for a command in the command pool, writes its
// opcode/commandsize header and advances the pool bookkeeping. When the
// command does not fit in the space left before the end of the pool, a Reset
// command is written at the current position and the unused tail is counted
// as used.
876 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
878 DPSOFTRAST_Command *command;
879 int freecommand = dpsoftrast.commandpool.freecommand;
880 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra: room for a command header, plus the skipped tail when wrapping is needed
881 int extra = sizeof(DPSOFTRAST_Command);
882 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
883 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: wait for the threads (threaded) or flush
884 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
886 if (dpsoftrast.usethreads)
887 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
889 DPSOFTRAST_Draw_FlushThreads();
// reload the bookkeeping, which the calls above may have changed
890 freecommand = dpsoftrast.commandpool.freecommand;
891 usedcommands = dpsoftrast.commandpool.usedcommands;
// the command would run past the end of the pool: mark the tail with Reset
893 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
895 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
896 command->opcode = DPSOFTRAST_OPCODE_Reset;
897 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
900 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
901 command->opcode = opcode;
902 command->commandsize = size;
904 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
906 dpsoftrast.commandpool.freecommand = freecommand;
907 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated size bytes of the command pool by
// adjusting freecommand (with wrap-around by the pool size) and reducing
// usedcommands by size.
911 static void DPSOFTRAST_UndoCommand(int size)
913 int freecommand = dpsoftrast.commandpool.freecommand;
914 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap-around correction by the pool size
917 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
918 usedcommands -= size;
919 dpsoftrast.commandpool.freecommand = freecommand;
920 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): carries the viewport rectangle.
923 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread side: stores the rectangle as viewport = {x, y, width, height} and
// marks the framebuffer-derived state dirty.
924 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
926 thread->viewport[0] = command->x;
927 thread->viewport[1] = command->y;
928 thread->viewport[2] = command->width;
929 thread->viewport[3] = command->height;
930 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a Viewport command and also updates the global viewport
// and its derived transform immediately.
932 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
934 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
937 command->width = width;
938 command->height = height;
940 dpsoftrast.viewport[0] = x;
941 dpsoftrast.viewport[1] = y;
942 dpsoftrast.viewport[2] = width;
943 dpsoftrast.viewport[3] = height;
944 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): carries the clear color as four floats.
947 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: fills the part of each color buffer that lies inside both the
// scissor rectangle and this thread's row bands with the packed clear color.
948 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
950 int i, x1, y1, x2, y2, w, h, x, y;
951 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
955 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
956 miny1 = thread->miny1;
957 maxy1 = thread->maxy1;
958 miny2 = thread->miny2;
959 maxy2 = thread->maxy2;
// fb_scissor is {x, y, width, height}
960 x1 = thread->fb_scissor[0];
961 y1 = thread->fb_scissor[1];
962 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
963 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows this thread owns
964 if (y1 < miny1) y1 = miny1;
965 if (y2 > maxy2) y2 = maxy2;
970 // FIXME: honor fb_colormask?
971 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// up to four color buffers; a null pointer entry is tested for below
972 for (i = 0;i < 4;i++)
974 if (!dpsoftrast.fb_colorpixels[i])
// walks the first band up to maxy1, then jumps to miny2 for the second band
976 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
979 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
980 for (x = x1;x < x2;x++)
// API side: queues a ClearColor command.
985 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
987 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): carries the clear depth as a float.
994 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: fills the part of the depth buffer that lies inside both the
// scissor rectangle and this thread's row bands with the converted depth value.
995 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
997 int x1, y1, x2, y2, w, h, x, y;
998 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
1002 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1003 miny1 = thread->miny1;
1004 maxy1 = thread->maxy1;
1005 miny2 = thread->miny2;
1006 maxy2 = thread->maxy2;
// fb_scissor is {x, y, width, height}
1007 x1 = thread->fb_scissor[0];
1008 y1 = thread->fb_scissor[1];
1009 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1010 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows this thread owns
1011 if (y1 < miny1) y1 = miny1;
1012 if (y2 > maxy2) y2 = maxy2;
1017 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// walks the first band up to maxy1, then jumps to miny2 for the second band
1018 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1019 for (;y < bandy;y++)
1021 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1022 for (x = x1;x < x2;x++)
// API side: queues a ClearDepth command.
1026 void DPSOFTRAST_ClearDepth(float d)
1028 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): carries one enable value per color channel.
1032 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread side: stores each channel enable as 0 or 1 and builds fb_colormask,
// a 32-bit mask with 0xFF in the byte of every enabled channel
// (red bits 16-23, green 8-15, blue 0-7, alpha 24-31).
1033 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1035 thread->colormask[0] = command->r != 0;
1036 thread->colormask[1] = command->g != 0;
1037 thread->colormask[2] = command->b != 0;
1038 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all-ones, which selects the channel's byte mask
1039 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: queues a ColorMask command.
1041 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1043 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): carries the depth test enable flag.
1050 DEFCOMMAND(5, DepthTest, int enable;)
// Thread side: stores the flag and marks the derived depth function dirty.
1051 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1053 thread->depthtest = command->enable;
1054 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API side: queues a DepthTest command.
1056 void DPSOFTRAST_DepthTest(int enable)
1058 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1059 command->enable = enable;
// ScissorTest command, declared through DEFCOMMAND with id 6 and one int field.
1062 DEFCOMMAND(6, ScissorTest, int enable;)
// Copies the enable flag into the thread state and marks the framebuffer
// state (DPSOFTRAST_VALIDATE_FB) as needing revalidation.
1063 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1065 thread->scissortest = command->enable;
1066 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command carrying the requested enable flag.
1068 void DPSOFTRAST_ScissorTest(int enable)
1070 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1071 command->enable = enable;
// Scissor command, declared through DEFCOMMAND with id 7; the rectangle is
// carried as four floats.
1074 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Stores the rectangle as thread->scissor[0..3] = x, y, width, height and marks
// the framebuffer state as needing revalidation.
1075 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1077 thread->scissor[0] = command->x;
1078 thread->scissor[1] = command->y;
1079 thread->scissor[2] = command->width;
1080 thread->scissor[3] = command->height;
1081 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command; width and height are copied into the command.
// NOTE(review): x and y are presumably copied the same way, since the Scissor
// interpreter reads command->x and command->y - confirm.
1083 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1085 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1088 command->width = width;
1089 command->height = height;
// BlendFunc command, declared through DEFCOMMAND with id 8; carries the source
// and destination blend factors as ints.
1092 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Stores the factors as thread->blendfunc[0] (source) and [1] (destination)
// and marks the blend function state as needing revalidation.
1093 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1095 thread->blendfunc[0] = command->sfactor;
1096 thread->blendfunc[1] = command->dfactor;
1097 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendFunc command carrying the source and destination factors.
1099 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1101 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1102 command->sfactor = sfactor;
1103 command->dfactor = dfactor;
// BlendSubtract command, declared through DEFCOMMAND with id 9 and one int field.
1106 DEFCOMMAND(9, BlendSubtract, int enable;)
// Copies the flag into thread->blendsubtract and marks the blend function
// state as needing revalidation.
1107 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1109 thread->blendsubtract = command->enable;
1110 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendSubtract command carrying the requested enable flag.
1112 void DPSOFTRAST_BlendSubtract(int enable)
1114 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1115 command->enable = enable;
// DepthMask command, declared through DEFCOMMAND with id 10 and one int field.
1118 DEFCOMMAND(10, DepthMask, int enable;)
// Copies the flag into thread->depthmask.
1119 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1121 thread->depthmask = command->enable;
// Queues a DepthMask command carrying the requested enable flag.
1123 void DPSOFTRAST_DepthMask(int enable)
1125 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1126 command->enable = enable;
// DepthFunc command, declared through DEFCOMMAND with id 11; carries the
// comparison function as an int.
1129 DEFCOMMAND(11, DepthFunc, int func;)
// Copies the function selector into thread->depthfunc.
1130 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1132 thread->depthfunc = command->func;
// Queues a DepthFunc command carrying the requested function selector.
1134 void DPSOFTRAST_DepthFunc(int func)
1136 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1137 command->func = func;
// DepthRange command, declared through DEFCOMMAND with id 12; carries the near
// and far values as floats.
1140 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Stores the range as thread->depthrange[0] (near) and [1] (far).
1141 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1143 thread->depthrange[0] = command->nearval;
1144 thread->depthrange[1] = command->farval;
// Queues a DepthRange command carrying the near and far values.
1146 void DPSOFTRAST_DepthRange(float nearval, float farval)
1148 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1149 command->nearval = nearval;
1150 command->farval = farval;
// PolygonOffset command, declared through DEFCOMMAND with id 13; carries two
// float offsets.
1153 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Stores the offsets as thread->polygonoffset[0] (alongnormal) and [1] (intoview).
1154 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1156 thread->polygonoffset[0] = command->alongnormal;
1157 thread->polygonoffset[1] = command->intoview;
// Queues a PolygonOffset command carrying both offsets.
1159 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1161 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1162 command->alongnormal = alongnormal;
1163 command->intoview = intoview;
// CullFace command, declared through DEFCOMMAND with id 14; carries the cull
// mode as an int.
1166 DEFCOMMAND(14, CullFace, int mode;)
// Copies the mode into thread->cullface.
1167 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1169 thread->cullface = command->mode;
// Queues a CullFace command carrying the requested mode.
1171 void DPSOFTRAST_CullFace(int mode)
1173 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1174 command->mode = mode;
// Sets the current constant colour. Unlike the queued state setters, this
// writes dpsoftrast.color[0..3] directly; DPSOFTRAST_Array_Load uses that
// value to fill the colour array when no colour pointer is set.
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1179 dpsoftrast.color[0] = r;
1180 dpsoftrast.color[1] = g;
1181 dpsoftrast.color[2] = b;
1182 dpsoftrast.color[3] = a;
// Reads a rectangle of the colour framebuffer into outpixels.
//   blockx, blocky, blockwidth, blockheight - requested rectangle in pixels
//   outpixels - destination, addressed with a row pitch of blockwidth * 4 bytes
// The rectangle is clamped to the framebuffer, and source rows are addressed
// as (fb_height - 1 - y), i.e. the framebuffer is read vertically flipped.
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// row pitches in bytes (4 bytes per pixel on both sides)
1187 int outstride = blockwidth * 4;
1188 int instride = dpsoftrast.fb_width * 4;
1191 int bx2 = blockx + blockwidth;
1192 int by2 = blocky + blockheight;
1196 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer bounds
1200 if (bx1 < 0) bx1 = 0;
1201 if (by1 < 0) by1 = 0;
1202 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1205 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts take a separate path that walks the row pixel by pixel
1206 if (dpsoftrast.bigendian)
1208 for (y = by1;y < by2;y++)
// b: start of the flipped source row at column bx1; o: start of output row
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
1212 for (x = bx1;x < bx2;x++)
// other path: same row addressing as above
1225 for (y = by1;y < by2;y++)
1227 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the colour framebuffer into one mip level of a texture.
//   index, mip    - texture handle and mip level to write
//   tx, ty        - destination corner in the texture
//   sx, sy        - source corner in the framebuffer
//   width, height - requested size in pixels
// Does nothing if the texture handle is invalid or mip is out of range.
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1238 int tx2 = tx + width;
1239 int ty2 = ty + height;
1242 int sx2 = sx + width;
1243 int sy2 = sy + height;
1253 unsigned int *spixels;
1254 unsigned int *tpixels;
1255 DPSOFTRAST_Texture *texture;
1256 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the colour framebuffer; destination is the mip level's pixel data
// (mipmap[mip][0] is a byte offset into texture->bytes, [2]/[3] its width/height)
1259 spixels = dpsoftrast.fb_colorpixels[0];
1260 swidth = dpsoftrast.fb_width;
1261 sheight = dpsoftrast.fb_height;
1262 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263 twidth = texture->mipmap[mip][2];
1264 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1265 if (tx1 < 0) tx1 = 0;
1266 if (ty1 < 0) ty1 = 0;
1267 if (tx2 > twidth) tx2 = twidth;
1268 if (ty2 > theight) ty2 = theight;
1269 if (sx1 < 0) sx1 = 0;
1270 if (sy1 < 0) sy1 = 0;
1271 if (sx2 > swidth) sx2 = swidth;
1272 if (sy2 > sheight) sy2 = sheight;
// the copied size is the smaller of the two clamped rectangles
1277 if (tw > sw) tw = sw;
1278 if (th > sh) th = sh;
1279 if (tw < 1 || th < 1)
// convert both start rows so they are counted from the opposite edge
1281 sy1 = sheight - sy1 - th;
1282 ty1 = theight - ty1 - th;
// copy row by row, tw pixels of 4 bytes each
1283 for (y = 0;y < th;y++)
1284 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// rebuild the mip chain when the texture has more than one level
1285 if (texture->mipmaps > 1)
1286 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command, declared through DEFCOMMAND with id 17; carries the
// texture unit number and the texture pointer to bind (may be NULL).
1289 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Rebinds one texture unit in a thread's state. If a texture was bound to the
// unit, its binds counter is atomically decremented before the new pointer is
// stored.
1290 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1292 if (thread->texbound[command->unitnum])
1293 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1294 thread->texbound[command->unitnum] = command->texture;
1296 void DPSOFTRAST_SetTexture(int unitnum, int index)
1298 DPSOFTRAST_Command_SetTexture *command;
1299 DPSOFTRAST_Texture *texture;
1300 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1302 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1305 texture = DPSOFTRAST_Texture_GetByIndex(index);
1306 if (index && !texture)
1308 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1312 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1313 command->unitnum = unitnum;
1314 command->texture = texture;
1316 dpsoftrast.texbound[unitnum] = texture;
1317 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its stride in the global state.
// DPSOFTRAST_Array_Load uses the stride as a byte offset between vertices.
1320 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1322 dpsoftrast.pointer_vertex3f = vertex3f;
1323 dpsoftrast.stride_vertex = stride;
// Selects a float colour array: stores the pointer and stride and clears the
// unsigned-byte colour pointer so only one colour source is active.
1325 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1327 dpsoftrast.pointer_color4f = color4f;
1328 dpsoftrast.pointer_color4ub = NULL;
1329 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte colour array: stores the pointer and stride and
// clears the float colour pointer so only one colour source is active.
1331 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1333 dpsoftrast.pointer_color4f = NULL;
1334 dpsoftrast.pointer_color4ub = color4ub;
1335 dpsoftrast.stride_color = stride;
// Records the texcoord array for one unit: pointer, component count and stride.
// NOTE(review): unitnum indexes the arrays without a range check here -
// callers are presumably expected to pass a valid unit; confirm.
1337 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1339 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1340 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1341 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command, declared through DEFCOMMAND with id 18; carries the shader
// mode, permutation bits and the exact-specular-math flag.
1344 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Copies the three shader selection values into the thread state.
1345 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1347 thread->shader_mode = command->mode;
1348 thread->shader_permutation = command->permutation;
1349 thread->shader_exactspecularmath = command->exactspecularmath;
// Queues a SetShader command for the threads and also mirrors the same three
// values into the global dpsoftrast state.
1351 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1353 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1354 command->mode = mode;
1355 command->permutation = permutation;
1356 command->exactspecularmath = exactspecularmath;
// mirror in the global (non-thread) state
1358 dpsoftrast.shader_mode = mode;
1359 dpsoftrast.shader_permutation = permutation;
1360 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command, declared through DEFCOMMAND with id 19; carries a uniform
// slot and four floats.
1363 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Copies the four floats into the thread's uniform storage; each uniform slot
// occupies four consecutive floats, hence the index*4 offset.
1364 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1366 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a four-component uniform from individual values: queues a Uniform4f
// command for the threads and mirrors the values into the global uniform
// storage at floats index*4 .. index*4+3.
1368 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1370 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1371 command->index = index;
1372 command->val[0] = v0;
1373 command->val[1] = v1;
1374 command->val[2] = v2;
1375 command->val[3] = v3;
// mirror in the global (non-thread) state
1377 dpsoftrast.uniform4f[index*4+0] = v0;
1378 dpsoftrast.uniform4f[index*4+1] = v1;
1379 dpsoftrast.uniform4f[index*4+2] = v2;
1380 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: v must point at four readable floats.
// Queues a Uniform4f command and mirrors the values into the global storage.
1382 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1384 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1385 command->index = index;
1386 memcpy(command->val, v, sizeof(command->val));
// mirror in the global (non-thread) state
1388 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command, declared through DEFCOMMAND with id 20; carries a
// uniform slot and a 16-float matrix declared with the ALIGN macro.
1391 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Copies all 16 floats into the thread's uniform storage starting at
// index*4, so one matrix covers four consecutive uniform slots.
1392 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1394 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads an array of 4x4 float matrices as uniforms.
//   uniform   - first uniform slot to write
//   arraysize - number of matrices read from v
//   transpose - NOTE(review): presumably selects the transpose step below - confirm
//   v         - arraysize * 16 floats; may be unaligned
// Each matrix gets its own UniformMatrix4f command and is also mirrored into
// the global uniform storage.
1396 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// one matrix per iteration: the slot advances by 4 and the input by 16 floats
1400 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1402 __m128 m0, m1, m2, m3;
1403 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1404 command->index = (DPSOFTRAST_UNIFORM)index;
// use unaligned loads when v is not a multiple of ALIGN_SIZE
1405 if (((size_t)v)&(ALIGN_SIZE-1))
1407 m0 = _mm_loadu_ps(v);
1408 m1 = _mm_loadu_ps(v+4);
1409 m2 = _mm_loadu_ps(v+8);
1410 m3 = _mm_loadu_ps(v+12);
// aligned input: aligned loads
1414 m0 = _mm_load_ps(v);
1415 m1 = _mm_load_ps(v+4);
1416 m2 = _mm_load_ps(v+8);
1417 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the halves so that
// m0..m3 end up holding the columns of the loaded matrix
1421 __m128 t0, t1, t2, t3;
1422 t0 = _mm_unpacklo_ps(m0, m1);
1423 t1 = _mm_unpacklo_ps(m2, m3);
1424 t2 = _mm_unpackhi_ps(m0, m1);
1425 t3 = _mm_unpackhi_ps(m2, m3);
1426 m0 = _mm_movelh_ps(t0, t1);
1427 m1 = _mm_movehl_ps(t1, t0);
1428 m2 = _mm_movelh_ps(t2, t3);
1429 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload ...
1431 _mm_store_ps(command->val, m0);
1432 _mm_store_ps(command->val+4, m1);
1433 _mm_store_ps(command->val+8, m2);
1434 _mm_store_ps(command->val+12, m3);
// ... and into the global mirror (16 floats starting at index*4)
1435 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1437 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1438 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command, declared through DEFCOMMAND with id 21; carries a uniform
// slot and one int value.
1443 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Stores the value in the thread's integer uniform table.
1444 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1446 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: queues a Uniform1i command and mirrors the value
// into the global integer uniform table.
// NOTE(review): i0 is presumably also stored in command->val, which the
// Uniform1i interpreter reads - confirm.
1448 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1450 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1451 command->index = index;
// mirror in the global (non-thread) state
1454 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command, declared through DEFCOMMAND with id 24; carries the four
// plane coefficients.
1457 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Copies the four coefficients into thread->clipplane and marks the
// framebuffer state as needing revalidation.
1458 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1460 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1461 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ClipPlane command with the coefficients stored in x, y, z, w order.
1463 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1465 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1466 command->clipplane[0] = x;
1467 command->clipplane[1] = y;
1468 command->clipplane[2] = z;
1469 command->clipplane[3] = w;
// Copies `size` four-float elements from a strided source into dst.
//   dst    - written with _mm_store_ps, so it must be 16-byte aligned
//   src    - first source element
//   stride - source stride; NOTE(review): presumably in bytes, as src is a byte pointer
// The source is read with unaligned loads unless both src and stride are
// multiples of ALIGN_SIZE.
1473 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1475 float *end = dst + size*4;
1476 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source
1480 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// aligned source
1489 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` three-float elements into four-float elements whose fourth
// component is 1.0f.
//   dst    - written with _mm_store_ps, so it must be 16-byte aligned
//   src    - first source element
//   stride - source stride; compared against sizeof(float[3]), i.e. bytes
// Tightly packed input takes a path that converts four elements (12 floats,
// three vector loads) at a time; other strides are converted one at a time.
// The 1.0f is inserted by rotating the lanes, replacing lane 0 with
// _mm_move_ss, and rotating back.
1496 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1498 float *end = dst + size*4;
1499 if (stride == sizeof(float[3]))
// end4: end of the output covered by whole groups of four elements
1501 float *end4 = dst + (size&~3)*4;
1502 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed input: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1506 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// element 0
1507 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1508 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1509 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 1 (straddles v1 and v2)
1510 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1511 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1512 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 2 (straddles v2 and v3)
1513 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1514 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1515 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1516 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 3 (upper three lanes of v3)
1517 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1518 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520 src += 4*sizeof(float[3]);
// aligned packed input: same shuffles with aligned loads
1527 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1528 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1529 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1530 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1532 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1533 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1535 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1536 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1537 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1538 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1539 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1541 src += 4*sizeof(float[3]);
// one element at a time: load four floats and overwrite the fourth with 1.0f
// NOTE(review): each load reads 16 bytes starting at a three-float element,
// i.e. 4 bytes beyond it - confirm the source buffers tolerate that
1545 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1549 __m128 v = _mm_loadu_ps((const float *)src);
1550 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1551 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1552 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1553 _mm_store_ps(dst, v);
// aligned variant of the single-element conversion
1562 __m128 v = _mm_load_ps((const float *)src);
1563 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1564 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1565 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1566 _mm_store_ps(dst, v);
// Expands `size` two-float elements into four-float elements of the form
// (a, b, 0, 1).
//   dst    - written with _mm_store_ps, so it must be 16-byte aligned
//   src    - first source element
//   stride - source stride; compared against sizeof(float[2]), i.e. bytes
// Tightly packed input is converted two elements per vector load; other
// strides are converted one element at a time.
1573 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1575 float *end = dst + size*4;
// supplies the constant upper half (0, 1) of every output element
1576 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1577 if (stride == sizeof(float[2]))
// end2: end of the output covered by whole pairs of elements
1579 float *end2 = dst + (size&~1)*4;
1580 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed input: v = a0 b0 a1 b1
1584 __m128 v = _mm_loadu_ps((const float *)src);
// low half of v + (0, 1), then high half of v + (0, 1)
1585 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1586 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1588 src += 2*sizeof(float[2]);
// aligned packed input
1595 __m128 v = _mm_load_ps((const float *)src);
1596 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1597 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1599 src += 2*sizeof(float[2]);
// single element: load two floats into the low half, keep (0, 1) in the high half
1605 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` four-byte elements into four-float elements, scaling each
// byte by 1/255 so 0..255 maps to 0.0..1.0.
//   dst    - written with _mm_store_ps, so it must be 16-byte aligned
//   src    - first source element
//   stride - source stride; compared against sizeof(unsigned char[4]), i.e. bytes
// Tightly packed input is converted four elements (16 bytes) per load; other
// strides are converted one element at a time.
1611 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1613 float *end = dst + size*4;
1614 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1615 if (stride == sizeof(unsigned char[4]))
// end4: end of the output covered by whole groups of four elements
1617 float *end4 = dst + (size&~3)*4;
1618 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed input: widen bytes to 16 bits (v1 = low 8, v2 = high 8)
1622 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
// widen again to 32 bits, convert to float and scale; one element per store
1623 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1624 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1626 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1628 src += 4*sizeof(unsigned char[4]);
// aligned packed input: same conversion with an aligned load
1635 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1636 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1637 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1638 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1639 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1641 src += 4*sizeof(unsigned char[4]);
// single element: read the four bytes as one 32-bit value, then widen and scale
1647 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1648 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of one four-float value.
//   dst  - output of size*4 floats; written with _mm_store_ps, so it must be
//          16-byte aligned
//   src  - the four floats to replicate (read once, alignment not required)
//   size - number of four-float elements
1654 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1656 float *end = dst + 4*size;
1657 __m128 v = _mm_loadu_ps(src);
1660 _mm_store_ps(dst, v);
// Transforms numitems four-float vectors by a 4x4 matrix.
//   out4f       - output vectors (may be the same buffer as in4f)
//   in4f        - input vectors
//   inmatrix16f - 16 floats, loaded as four vectors m0..m3
// Each output is x*m0 + y*m1 + z*m2 + w*m3, where x..w are the input lanes.
1666 static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1669 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1670 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1672 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1674 // fast case for identity matrix
1675 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1678 end = out4f + numitems*4;
// the matrix is loaded unaligned, so inmatrix16f has no alignment requirement
1679 m0 = _mm_loadu_ps(inmatrix16f);
1680 m1 = _mm_loadu_ps(inmatrix16f + 4);
1681 m2 = _mm_loadu_ps(inmatrix16f + 8);
1682 m3 = _mm_loadu_ps(inmatrix16f + 12);
1683 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: broadcast each lane and accumulate lane * matrix vector
1687 __m128 v = _mm_loadu_ps(in4f);
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1691 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1692 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input: same arithmetic with an aligned load
1701 __m128 v = _mm_load_ps(in4f);
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1704 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1705 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1706 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems four-float vectors from in4f to out4f with memcpy, so the
// two buffers must not overlap.
1715 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1717 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective divide plus viewport mapping of one __m128 vertex.
// The lanes are rotated so w sits in lane 0, where it is replaced by 1.0f.
// The macro then computes viewportcenter + viewportscale * p / w and rotates
// the lanes back. The x, y, z results therefore use lanes 1..3 of the viewport
// vectors, and the output w lane holds center[0] + scale[0] * (1/w).
1722 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1724 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1725 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1726 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1727 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection helper with the same parameters and the same statements as
// DPSOFTRAST_PROJECTVERTEX: rotate w into lane 0 and replace it with 1.0f,
// compute viewportcenter + viewportscale * p / w, then rotate back.
1730 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1732 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1733 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1734 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1735 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one __m128 vertex by a matrix given as four vectors:
// out = x*m0 + y*m1 + z*m2 + w*m3, where x..w are the lanes of the local p
// (NOTE(review): p is presumably a copy of `in` - confirm).
1738 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1741 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1742 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1743 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1744 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space row range covered by an axis-aligned bounding box
// after transformation by inmatrix16f.
//   starty, endy     - receive the first row and one past the last row
//   minposf, maxposf - box corners; read with _mm_load_ps, so they must be
//                      16-byte aligned
//   inmatrix16f      - 16-float matrix, loaded unaligned
// The eight corners are transformed and tested against the plane z + w >= 0.
// Corners on the visible side contribute their projected Y directly; for
// corners behind the plane, each box edge to a visible neighbour contributes
// the projected Y of its intersection with the plane.
// NOTE(review): the return value is presumably clipmask (one bit per corner
// still behind the plane); callers store it in dpsoftrast.drawclipped - confirm.
1747 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1749 int clipmask = 0xFF;
1750 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1751 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1752 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1753 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix vector so the transformed Y lands in
// lane 0, where the scalar (_ss) operations below work
1754 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1755 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1756 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1757 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its clip distance (lane 2 +
// lane 3, i.e. z + w); when that is >= 0, clear bit k of clipmask and fold the
// corner's Y / w into minproj/maxproj
1758 #define BBFRONT(k, pos) \
1760 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1761 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1762 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1765 clipmask &= ~(1<<k); \
1766 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1767 minproj = _mm_min_ss(minproj, proj); \
1768 maxproj = _mm_max_ss(maxproj, proj); \
1772 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1773 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1774 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1775 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1776 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1777 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1781 if (clipmask&(1<<k)) \
1783 if (!(clipmask&(1<<(k^1)))) \
1785 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1786 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1787 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1788 minproj = _mm_min_ss(minproj, proj); \
1789 maxproj = _mm_max_ss(maxproj, proj); \
1791 if (!(clipmask&(1<<(k^2)))) \
1793 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1794 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1795 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1796 minproj = _mm_min_ss(minproj, proj); \
1797 maxproj = _mm_max_ss(maxproj, proj); \
1799 if (!(clipmask&(1<<(k^4)))) \
1801 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1802 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1803 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1804 minproj = _mm_min_ss(minproj, proj); \
1805 maxproj = _mm_max_ss(maxproj, proj); \
1809 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors into lane 0 (the lane applied to Y by
// the projection macros), clamp the projected range to [-2, 2], then map it
// to screen space
1810 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1811 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1812 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1813 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1814 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1815 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// truncating conversions; starty comes from maxproj and endy from minproj
// NOTE(review): this ordering implies a negative Y viewport scale - confirm
1816 *starty = _mm_cvttss_si32(maxproj);
1817 *endy = _mm_cvttss_si32(minproj)+1;
// Projects already-transformed vertices to screen space and computes the row
// range they cover.
//   out4f        - receives a copy of each input vertex (aligned stores)
//   screen4f     - receives each projected vertex (aligned stores)
//   starty, endy - receive the row range computed by DPSOFTRAST_Vertex_BoundY
//   in4f         - numitems four-float vertices; may be unaligned
// Returns the result of DPSOFTRAST_Vertex_BoundY, called with the bounding box
// of the input vertices and the identity matrix.
1821 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1823 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1824 float *end = out4f + numitems*4;
1825 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1826 __m128 minpos, maxpos;
1827 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounding box with the first vertex
1829 minpos = maxpos = _mm_loadu_ps(in4f);
// per vertex: grow the box, copy the vertex out, project it into screen4f
1832 __m128 v = _mm_loadu_ps(in4f);
1833 minpos = _mm_min_ps(minpos, v);
1834 maxpos = _mm_max_ps(maxpos, v);
1835 _mm_store_ps(out4f, v);
1836 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1837 _mm_store_ps(screen4f, v);
// aligned input: same work with aligned loads
1845 minpos = maxpos = _mm_load_ps(in4f);
1848 __m128 v = _mm_load_ps(in4f);
1849 minpos = _mm_min_ps(minpos, v);
1850 maxpos = _mm_max_ps(maxpos, v);
1851 _mm_store_ps(out4f, v);
1852 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1853 _mm_store_ps(screen4f, v);
// spill the box to aligned arrays, as DPSOFTRAST_Vertex_BoundY loads them aligned
1861 ALIGN(float minposf[4]);
1862 ALIGN(float maxposf[4]);
1863 _mm_store_ps(minposf, minpos);
1864 _mm_store_ps(maxposf, maxpos);
1865 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms vertices by a matrix, projects them to screen space and computes
// the row range they cover.
//   out4f        - receives each transformed vertex (aligned stores)
//   screen4f     - receives each projected vertex (aligned stores)
//   starty, endy - receive the row range computed by DPSOFTRAST_Vertex_BoundY
//   in4f         - numitems four-float vertices; may be unaligned
//   inmatrix16f  - 16-float matrix, loaded unaligned
// An identity matrix is delegated to DPSOFTRAST_Vertex_Project. Otherwise the
// bounding box is taken over the untransformed input, and
// DPSOFTRAST_Vertex_BoundY applies inmatrix16f to it; its result is returned.
1870 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1872 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1873 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// bytewise identity check selects the path without the matrix multiply
1875 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1876 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1877 end = out4f + numitems*4;
1878 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1879 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1880 m0 = _mm_loadu_ps(inmatrix16f);
1881 m1 = _mm_loadu_ps(inmatrix16f + 4);
1882 m2 = _mm_loadu_ps(inmatrix16f + 8);
1883 m3 = _mm_loadu_ps(inmatrix16f + 12);
1884 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounding box with the first vertex
1886 minpos = maxpos = _mm_loadu_ps(in4f);
// per vertex: grow the box (pre-transform), transform, store, project, store
1889 __m128 v = _mm_loadu_ps(in4f);
1890 minpos = _mm_min_ps(minpos, v);
1891 maxpos = _mm_max_ps(maxpos, v);
1892 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1893 _mm_store_ps(out4f, v);
1894 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1895 _mm_store_ps(screen4f, v);
// aligned input: same work with aligned loads
1903 minpos = maxpos = _mm_load_ps(in4f);
1906 __m128 v = _mm_load_ps(in4f);
1907 minpos = _mm_min_ps(minpos, v);
1908 maxpos = _mm_max_ps(maxpos, v);
1909 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1910 _mm_store_ps(out4f, v);
1911 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1912 _mm_store_ps(screen4f, v);
// spill the box to aligned arrays, as DPSOFTRAST_Vertex_BoundY loads them aligned
1920 ALIGN(float minposf[4]);
1921 ALIGN(float maxposf[4]);
1922 _mm_store_ps(minposf, minpos);
1923 _mm_store_ps(maxposf, maxpos);
1924 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Converts one client vertex attribute array into four-float form in
// dpsoftrast.post_array4f[outarray], for the current draw range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices).
//   outarray - index of the destination post_array4f buffer
//   inarray  - source attribute: position, colour, or a texcoord unit given as
//              DPSOFTRAST_ARRAY_TEXCOORD0 + unit
// NOTE(review): presumably returns the destination buffer (outf) - confirm.
1930 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1933 float *outf = dpsoftrast.post_array4f[outarray];
1934 const unsigned char *inb;
1935 int firstvertex = dpsoftrast.firstvertex;
1936 int numvertices = dpsoftrast.numvertices;
// positions: three floats per vertex expanded to four
1940 case DPSOFTRAST_ARRAY_POSITION:
1941 stride = dpsoftrast.stride_vertex;
1942 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1943 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colours: float array if set, else byte array if set, else the constant colour
1945 case DPSOFTRAST_ARRAY_COLOR:
1946 stride = dpsoftrast.stride_color;
1947 if (dpsoftrast.pointer_color4f)
1949 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1950 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1952 else if (dpsoftrast.pointer_color4ub)
1954 stride = dpsoftrast.stride_color;
1955 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1956 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1960 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and the loader is
// chosen by that unit's component count
1964 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1965 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1967 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1968 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1971 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1974 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1977 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms an attribute array in place by inmatrix16f.
// When inarray >= 0 the array is first loaded with DPSOFTRAST_Array_Load;
// a negative inarray reuses whatever post_array4f[outarray] already holds.
// NOTE(review): presumably returns the transformed buffer (data) - confirm.
1989 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1991 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1992 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects an attribute array to screen space without a matrix transform.
// The array is loaded first when inarray >= 0, otherwise the existing
// post_array4f[outarray] contents are used. Screen coordinates are written to
// dpsoftrast.screencoord4f, the row range to drawstarty/drawendy, and the
// result of DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
1997 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
2000 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2001 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms an attribute array in place by inmatrix16f and projects it to
// screen space. The array is loaded first when inarray >= 0, otherwise the
// existing post_array4f[outarray] contents are used. Screen coordinates are
// written to dpsoftrast.screencoord4f, the row range to drawstarty/drawendy,
// and the result of DPSOFTRAST_Vertex_TransformProject to dpsoftrast.drawclipped.
2009 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2012 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2013 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares the per-pixel reciprocal of the interpolated w across a span.
//   thread   - per-thread state
//   triangle - supplies the w plane: w[0] is the slope along x, w[1] the slope
//              along y, w[2] the constant term
//   span     - supplies the origin (x, y) and the pixel range [startx, endx)
//   zf       - output buffer; NOTE(review): presumably receives one value per
//              pixel, indexed by x - confirm
// The exact reciprocal is evaluated only at subspan boundaries and
// interpolated linearly in between.
2020 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2023 int startx = span->startx;
2024 int endx = span->endx;
2025 float wslope = triangle->w[0];
// w at the span origin; endz starts as the exact 1/w at the first pixel
2026 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2027 float endz = 1.0f / (w + wslope * startx);
// zero slope along x means 1/w is constant over the whole span
2028 if (triangle->w[0] == 0)
2030 // LordHavoc: fast flat polygons (HUD/menu)
2031 for (x = startx;x < endx;x++)
// general case: walk the span in subspans of DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2035 for (x = startx;x < endx;)
2037 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the final subspan is shortened so it ends on the last pixel, endx-1
2039 if (nextsub >= endx) nextsub = endsub = endx-1;
// exact 1/w at the subspan end, then the per-pixel step towards it
// (0 when the subspan is a single pixel, avoiding a division by zero)
2040 endz = 1.0f / (w + wslope * nextsub);
2041 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2042 for (; x <= endsub; x++, z += dz)
// Writes one span of shaded 4-byte pixels (in4ub, indexed by x within the span)
// into the first color buffer (dpsoftrast.fb_colorpixels[0]).
// span->pixelmask selects which pixels are written: runs of zero mask bytes are
// skipped and runs of non-zero mask bytes are combined with the existing
// framebuffer contents according to thread->fb_blendmode.
// Side effects visible here: pixelmask[] is modified (alpha kill, zero-alpha
// culling, and two sentinel bytes at endx and endx+1).
2047 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2051 int startx = span->startx;
2052 int endx = span->endx;
// the same input and framebuffer memory is viewed both as bytes and as 32-bit pixels
2055 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2056 unsigned char * RESTRICT pixelmask = span->pixelmask;
2057 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2058 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views to pixel (span->x, span->y); the x indices used
// below are therefore relative to span->x
2061 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2062 pixeli += span->y * dpsoftrast.fb_width + span->x;
2063 // handle alphatest now (this affects depth writes too)
// alpha kill: pixels whose alpha byte is below 128 are removed from the mask
2064 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2065 for (x = startx;x < endx;x++)
2066 if (in4ub[x*4+3] < 128)
2067 pixelmask[x] = false;
2068 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2069 // helps sprites, text and hud artwork
2070 switch(thread->fb_blendmode)
2072 case DPSOFTRAST_BLENDMODE_ALPHA:
2073 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2074 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// the span is scanned as alternating runs: pixels with an alpha byte >= 1 are
// stepped over, pixels with an alpha byte < 1 (i.e. 0) get their mask cleared
2076 for (x = startx;x < endx;x++)
2078 if (in4ub[x*4+3] >= 1)
2083 while (++x < endx && in4ub[x*4+3] >= 1) ;
2085 if (x >= endx) break;
2087 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2088 if (x >= endx) break;
// the remaining blend modes are listed without any visible mask adjustment
2095 case DPSOFTRAST_BLENDMODE_OPAQUE:
2096 case DPSOFTRAST_BLENDMODE_ADD:
2097 case DPSOFTRAST_BLENDMODE_INVMOD:
2098 case DPSOFTRAST_BLENDMODE_MUL:
2099 case DPSOFTRAST_BLENDMODE_MUL2:
2100 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2101 case DPSOFTRAST_BLENDMODE_INVADD:
2104 // put some special values at the end of the mask to ensure the loops end
// a non-zero byte at endx terminates the search over masked-off (zero) bytes and
// a zero byte at endx+1 terminates the search over enabled bytes; this writes
// two entries past the span, so pixelmask must have room for endx+2 entries
2105 pixelmask[endx] = 1;
2106 pixelmask[endx+1] = 0;
2107 // LordHavoc: use a double loop to identify subspans, this helps the
2108 // optimized copy/blend loops to perform at their best, most triangles
2109 // have only one run of pixels, and do the search using wide reads...
2113 // if this pixel is masked off, it's probably not alone...
2120 // the 4-item search must be aligned or else it stalls badly
// step one byte at a time (up to three steps) until x is a multiple of 4,
// leaving early as soon as an enabled pixel is found
2121 if ((x & 3) && !pixelmask[x])
2123 if(pixelmask[x]) goto endmasked;
2127 if(pixelmask[x]) goto endmasked;
2131 if(pixelmask[x]) goto endmasked;
// four mask bytes are tested per iteration by reading them as one 32-bit word
2136 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// finish byte by byte to find the first enabled pixel
2140 for (;!pixelmask[x];x++)
2142 // rather than continue the loop, just check the end variable
2147 // find length of subspan
// subx scans forward from x over enabled pixels; the wide search is only
// attempted when more than 8 pixels remain before endx
2150 if (subx + 8 < endx)
2154 if(!pixelmask[subx]) goto endunmasked;
2158 if(!pixelmask[subx]) goto endunmasked;
2162 if(!pixelmask[subx]) goto endunmasked;
// 0x01010101 = four consecutive mask bytes that are all exactly 1
2167 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2171 for (;pixelmask[subx];subx++)
2173 // the checks can overshoot, so make sure to clip it...
2177 // now that we know the subspan length... process!
2178 switch(thread->fb_blendmode)
// OPAQUE: plain copy of pixels [x, subx).
// NOTE(review): the memcpy and the SSE copy loops below are presumably
// alternative implementations selected by preprocessor lines that are not
// visible in this excerpt - confirm.
2180 case DPSOFTRAST_BLENDMODE_OPAQUE:
2184 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// 16 pixels (four unaligned 128-bit moves) per iteration
2189 while (x + 16 <= subx)
2191 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2192 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2193 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2194 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels per iteration
2199 while (x + 4 <= subx)
2201 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2207 pixeli[x+1] = ini[x+1];
// All remaining modes go through FINISHBLEND(blend2, blend1):
//  - the for loop handles two pixels per iteration (64-bit load/store) and runs
//    the blend2 statement; a trailing single pixel is handled with blend1.
//  - src (new color) and dst (framebuffer) are widened from 8-bit to 16-bit
//    lanes before the blend statement and packed back with unsigned
//    saturation (_mm_packus_epi16) afterwards.
// In the blend statements "blend" is the source alpha broadcast to all four
// channels of its pixel (_MM_SHUFFLE(3, 3, 3, 3) on word 3 of each pixel).
// ALPHA: dst += (src - dst) * alpha; both factors are shifted left by 4 so that
// _mm_mulhi_epi16 (high 16 bits of the product) yields a net shift right by 8.
2217 case DPSOFTRAST_BLENDMODE_ALPHA:
2218 #define FINISHBLEND(blend2, blend1) \
2219 for (;x + 1 < subx;x += 2) \
2222 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2223 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2225 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2230 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2231 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2233 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2237 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2240 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2241 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2244 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2246 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2247 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2250 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (the saturation happens when the macro packs the result)
2253 case DPSOFTRAST_BLENDMODE_ADD:
2254 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2256 case DPSOFTRAST_BLENDMODE_INVMOD:
2258 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2260 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2263 case DPSOFTRAST_BLENDMODE_MUL:
2264 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2266 case DPSOFTRAST_BLENDMODE_MUL2:
2267 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2269 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2271 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2275 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + (dst - ((dst * alpha) >> 8))
2278 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2280 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2281 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2283 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2284 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src, using the same shift-by-4 / mulhi scaling
// as ALPHA for a net shift right by 8
2287 case DPSOFTRAST_BLENDMODE_INVADD:
2289 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2291 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples one texel of a 2D texture at normalized coordinates (x, y) from mip
// level `mip` and writes the four 8-bit channels, in texture byte order, to c.
// With DPSOFTRAST_TEXTURE_FILTER_LINEAR set in texture->filter the result is a
// bilinear blend of the 2x2 texel neighbourhood; otherwise the single nearest
// texel is used. Coordinates are clamped when DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE
// is set in texture->flags and wrapped with a (size-1) bit mask otherwise, so
// the wrap path relies on power-of-two mip dimensions.
2299 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2300 // warning: this is SLOW, only use if the optimized per-span functions won't do
2302 const unsigned char * RESTRICT pixelbase;
2303 const unsigned char * RESTRICT pixel[4];
2304 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2305 int wrapmask[2] = { width-1, height-1 };
// pixelbase = mipmap[mip][0] + mipmap[mip][1] - one row of texels (presumably
// offset + byte size - row, i.e. the last row in memory - TODO confirm the
// field meanings); rows are addressed below by subtracting tci[1]*width
2306 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2307 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// texel coordinates with 12 fractional bits; 2048 is half a texel so that the
// blend is centred on texel centres
2309 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2310 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2311 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// weights of the four neighbours in the order (x,y), (x+1,y), (x,y+1), (x+1,y+1);
// each is a product of two 12-bit factors and the four sum to 1<<24
2312 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2313 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2314 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2315 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2317 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2318 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2319 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2320 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2324 tci[0] &= wrapmask[0];
2325 tci[1] &= wrapmask[1];
2326 tci1[0] &= wrapmask[0];
2327 tci1[1] &= wrapmask[1];
// the right-hand neighbours (pixel[1], pixel[3]) must use the x+1 column
// tci1[0] so that they match the lerp[1]/lerp[3] weights above
2329 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2330 pixel[1] = pixelbase + 4 * (tci1[0] - tci[1]*width);
2331 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2332 pixel[3] = pixelbase + 4 * (tci1[0] - tci1[1]*width);
// weighted sum per channel; >>24 removes the 24 fractional bits of the weights
2333 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2334 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2335 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2336 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// nearest-texel path: integer texel coordinates, clamped or wrapped as above
2340 int tci[2] = { x * width, y * height };
2341 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2343 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2344 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2348 tci[0] &= wrapmask[0];
2349 tci[1] &= wrapmask[1];
2351 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
// Samples the 2D texture bound to texunitindex for every pixel of a span and
// writes the result as four floats per pixel to out4f (byte 2 of the texel goes
// to component 0 and byte 0 to component 2; values are scaled to 0..1).
// Texture coordinates come from attribute arrayindex: they are evaluated
// exactly (times zf[]) at subspan boundaries and stepped linearly in 12-bit
// fixed point in between. Four sampling loops are visible: bilinear or single
// texel, each with clamped or mask-wrapped coordinates.
2360 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2363 int startx = span->startx;
2364 int endx = span->endx;
2369 float tc[2], endtc[2];
2371 unsigned int tci[2];
2372 unsigned int tci1[2];
2373 unsigned int tcimin[2];
2374 unsigned int tcimax[2];
2379 const unsigned char * RESTRICT pixelbase;
2380 const unsigned char * RESTRICT pixel[4];
2381 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2382 // if no texture is bound, just fill it with white
2385 for (x = startx;x < endx;x++)
2387 out4f[x*4+0] = 1.0f;
2388 out4f[x*4+1] = 1.0f;
2389 out4f[x*4+2] = 1.0f;
2390 out4f[x*4+3] = 1.0f;
2394 mip = triangle->mip[texunitindex];
// pixelbase = mipmap[mip][0] + mipmap[mip][1] - one row of mipmap[mip][2]
// texels (presumably offset + byte size - row, i.e. the last row in memory -
// TODO confirm); rows are addressed with the negative stride tciwidth below
2395 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2396 // if this mipmap of the texture is 1 pixel, just fill it with that color
2397 if (texture->mipmap[mip][1] == 4)
// NOTE(review): this reads the first four bytes of texture->bytes rather than
// going through pixelbase - confirm this is intended for every mip level
2399 c[0] = texture->bytes[2] * (1.0f/255.0f);
2400 c[1] = texture->bytes[1] * (1.0f/255.0f);
2401 c[2] = texture->bytes[0] * (1.0f/255.0f);
2402 c[3] = texture->bytes[3] * (1.0f/255.0f);
2403 for (x = startx;x < endx;x++)
2405 out4f[x*4+0] = c[0];
2406 out4f[x*4+1] = c[1];
2407 out4f[x*4+2] = c[2];
2408 out4f[x*4+3] = c[3];
2412 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2413 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2414 flags = texture->flags;
// scale from normalized coordinates to texels (mipmap[mip][2] x mipmap[mip][3])
2415 tcscale[0] = texture->mipmap[mip][2];
2416 tcscale[1] = texture->mipmap[mip][3];
// negative row stride in texels, matching the pixelbase computation above
2417 tciwidth = -texture->mipmap[mip][2];
// clamp limits and wrap masks are both size-1; the wrap mask therefore relies
// on power-of-two dimensions (tcimin is set on lines not visible in this excerpt)
2420 tcimax[0] = texture->mipmap[mip][2]-1;
2421 tcimax[1] = texture->mipmap[mip][3]-1;
2422 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2423 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// exact texel-space coordinates at the first pixel of the span
2424 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2425 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2431 for (x = startx;x < endx;)
2433 unsigned int subtc[2];
2434 unsigned int substep[2];
// 4096 = 1<<12 fixed-point scale, pre-divided by the subspan length
2435 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2436 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2437 if (nextsub >= endx)
// last subspan: end exactly on pixel endx-1 and rescale the step to its length;
// a single remaining pixel keeps the default scale to avoid dividing by zero
2439 nextsub = endsub = endx-1;
2440 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2444 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2445 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// per-pixel step and start value in 12-bit fixed point
2451 substep[0] = (endtc[0] - tc[0]) * subscale;
2452 substep[1] = (endtc[1] - tc[1]) * subscale;
2453 subtc[0] = tc[0] * (1<<12);
2454 subtc[1] = tc[1] * (1<<12);
// bilinear sampling, coordinates clamped to [tcimin, tcimax]
2457 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2459 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit fractions and the four neighbour weights; the weights sum to 1<<24
2461 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2462 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2463 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2464 tci[0] = subtc[0]>>12;
2465 tci[1] = subtc[1]>>12;
2466 tci1[0] = tci[0] + 1;
2467 tci1[1] = tci[1] + 1;
2468 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2469 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2470 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2471 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// neighbours in the order (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2472 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2473 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2474 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2475 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * (1<<24): removes the weight scale and maps 0..255 to 0..1
2476 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2477 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2478 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2479 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2480 out4f[x*4+0] = c[0];
2481 out4f[x*4+1] = c[1];
2482 out4f[x*4+2] = c[2];
2483 out4f[x*4+3] = c[3];
// bilinear sampling, coordinates wrapped with the size-1 masks
2488 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2490 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2491 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2492 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2493 tci[0] = subtc[0]>>12;
2494 tci[1] = subtc[1]>>12;
2495 tci1[0] = tci[0] + 1;
2496 tci1[1] = tci[1] + 1;
2497 tci[0] &= tciwrapmask[0];
2498 tci[1] &= tciwrapmask[1];
2499 tci1[0] &= tciwrapmask[0];
2500 tci1[1] &= tciwrapmask[1];
2501 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2502 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2503 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2504 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2505 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2506 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2507 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2508 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2509 out4f[x*4+0] = c[0];
2510 out4f[x*4+1] = c[1];
2511 out4f[x*4+2] = c[2];
2512 out4f[x*4+3] = c[3];
// single-texel sampling, coordinates clamped to [tcimin, tcimax]
2516 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2518 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2520 tci[0] = subtc[0]>>12;
2521 tci[1] = subtc[1]>>12;
2522 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2523 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2524 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2525 c[0] = pixel[0][2] * (1.0f / 255.0f);
2526 c[1] = pixel[0][1] * (1.0f / 255.0f);
2527 c[2] = pixel[0][0] * (1.0f / 255.0f);
2528 c[3] = pixel[0][3] * (1.0f / 255.0f);
2529 out4f[x*4+0] = c[0];
2530 out4f[x*4+1] = c[1];
2531 out4f[x*4+2] = c[2];
2532 out4f[x*4+3] = c[3];
// single-texel sampling, coordinates wrapped with the size-1 masks
2537 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2539 tci[0] = subtc[0]>>12;
2540 tci[1] = subtc[1]>>12;
2541 tci[0] &= tciwrapmask[0];
2542 tci[1] &= tciwrapmask[1];
2543 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2544 c[0] = pixel[0][2] * (1.0f / 255.0f);
2545 c[1] = pixel[0][1] * (1.0f / 255.0f);
2546 c[2] = pixel[0][0] * (1.0f / 255.0f);
2547 c[3] = pixel[0][3] * (1.0f / 255.0f);
2548 out4f[x*4+0] = c[0];
2549 out4f[x*4+1] = c[1];
2550 out4f[x*4+2] = c[2];
2551 out4f[x*4+3] = c[3];
2558 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2562 int startx = span->startx;
2563 int endx = span->endx;
2565 __m128 data, slope, tcscale;
2566 __m128i tcsize, tcmask, tcoffset, tcmax;
2568 __m128i subtc, substep, endsubtc;
2571 int affine; // LordHavoc: optimized affine texturing case
2572 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2573 const unsigned char * RESTRICT pixelbase;
2574 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2575 // if no texture is bound, just fill it with white
2578 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2581 mip = triangle->mip[texunitindex];
2582 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2583 // if this mipmap of the texture is 1 pixel, just fill it with that color
2584 if (texture->mipmap[mip][1] == 4)
2586 unsigned int k = *((const unsigned int *)pixelbase);
2587 for (x = startx;x < endx;x++)
2591 affine = zf[startx] == zf[endx-1];
2592 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2593 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2594 flags = texture->flags;
2595 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2596 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2597 tcscale = _mm_cvtepi32_ps(tcsize);
2598 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2599 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2600 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2602 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2603 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2604 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2605 tcmax = _mm_packs_epi32(tcmask, tcmask);
2606 for (x = startx;x < endx;)
2608 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2609 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2610 if (nextsub >= endx || affine)
2612 nextsub = endsub = endx-1;
2613 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2617 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2619 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2620 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2621 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2622 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2623 substep = _mm_slli_epi32(substep, 1);
2626 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2627 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2629 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2630 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2632 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2633 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2634 tci = _mm_madd_epi16(tci, tcoffset);
2635 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2636 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2637 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2638 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2639 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2640 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2641 fracm = _mm_srli_epi16(subtc, 1);
2642 pix1 = _mm_add_epi16(pix1,
2643 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645 pix3 = _mm_add_epi16(pix3,
2646 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2647 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2648 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2649 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2650 pix2 = _mm_add_epi16(pix2,
2651 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2652 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2653 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2657 const unsigned char * RESTRICT ptr1;
2658 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2659 tci = _mm_madd_epi16(tci, tcoffset);
2660 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2661 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2662 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2663 fracm = _mm_srli_epi16(subtc, 1);
2664 pix1 = _mm_add_epi16(pix1,
2665 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2666 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2667 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2668 pix1 = _mm_add_epi16(pix1,
2669 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2670 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2671 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2675 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2677 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2679 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2680 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2681 tci = _mm_madd_epi16(tci, tcoffset);
2682 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2683 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2684 _mm_setzero_si128());
2685 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2686 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2687 _mm_setzero_si128());
2688 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2689 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2690 tci = _mm_madd_epi16(tci, tcoffset);
2691 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2692 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2693 _mm_setzero_si128());
2694 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2695 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2696 _mm_setzero_si128());
2697 fracm = _mm_srli_epi16(subtc, 1);
2698 pix1 = _mm_add_epi16(pix1,
2699 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2700 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2701 pix3 = _mm_add_epi16(pix3,
2702 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2703 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2704 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2705 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2706 pix2 = _mm_add_epi16(pix2,
2707 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2708 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2709 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2713 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2714 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2715 tci = _mm_madd_epi16(tci, tcoffset);
2716 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2717 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2718 _mm_setzero_si128());
2719 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2720 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2721 _mm_setzero_si128());
2722 fracm = _mm_srli_epi16(subtc, 1);
2723 pix1 = _mm_add_epi16(pix1,
2724 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2725 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2726 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2727 pix1 = _mm_add_epi16(pix1,
2728 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2729 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2730 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2736 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2738 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2739 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2740 tci = _mm_madd_epi16(tci, tcoffset);
2741 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2742 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2743 _mm_setzero_si128());
2744 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2745 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2746 _mm_setzero_si128());
2747 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2748 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2749 tci = _mm_madd_epi16(tci, tcoffset);
2750 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2751 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2752 _mm_setzero_si128());
2753 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2754 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2755 _mm_setzero_si128());
2756 fracm = _mm_srli_epi16(subtc, 1);
2757 pix1 = _mm_add_epi16(pix1,
2758 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2759 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2760 pix3 = _mm_add_epi16(pix3,
2761 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2762 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2763 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2764 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2765 pix2 = _mm_add_epi16(pix2,
2766 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2767 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2768 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2772 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2773 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2774 tci = _mm_madd_epi16(tci, tcoffset);
2775 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2776 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2777 _mm_setzero_si128());
2778 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2779 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2780 _mm_setzero_si128());
2781 fracm = _mm_srli_epi16(subtc, 1);
2782 pix1 = _mm_add_epi16(pix1,
2783 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2784 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2785 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2786 pix1 = _mm_add_epi16(pix1,
2787 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2788 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2789 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2796 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2798 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2800 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2801 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2802 tci = _mm_madd_epi16(tci, tcoffset);
2803 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2804 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2808 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2809 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2810 tci = _mm_madd_epi16(tci, tcoffset);
2811 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2817 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2819 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2820 tci = _mm_and_si128(tci, tcmax);
2821 tci = _mm_madd_epi16(tci, tcoffset);
2822 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2823 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2827 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2828 tci = _mm_and_si128(tci, tcmax);
2829 tci = _mm_madd_epi16(tci, tcoffset);
2830 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map counterpart of the 2D span sampler. The only visible statement fills
// the span's output pixels [startx, endx) with 0xFF bytes (opaque white); no
// texture is sampled and triangle, texunitindex, arrayindex and zf are unused
// in the visible code.
2839 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// length is (endx - startx) pixels of 4 bytes each; the operands must be in
// this order because a negative int would convert to an enormous size_t
2842 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Shadow-map lookup for a vector, returning a float.
// NOTE(review): only the signature is visible in this excerpt; what the
// function computes and what range it returns must be confirmed from its body.
2845 static float DPSOFTRAST_SampleShadowmap(const float *vector)
// For every pixel of the span, multiplies the four floats of in4f by the
// interpolated four-component attribute arrayindex and stores the product in
// out4f. The attribute is evaluated as (data + slope * x) * z per component.
// NOTE(review): the assignment of z is not visible in this excerpt; it is
// presumably zf[x] - confirm.
2852 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2855 int startx = span->startx;
2856 int endx = span->endx;
// data/slope: attribute value and per-x slope supplied by the macro
2861 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2862 for (x = startx;x < endx;x++)
2865 c[0] = (data[0] + slope[0]*x) * z;
2866 c[1] = (data[1] + slope[1]*x) * z;
2867 c[2] = (data[2] + slope[2]*x) * z;
2868 c[3] = (data[3] + slope[3]*x) * z;
2869 out4f[x*4+0] = in4f[x*4+0] * c[0];
2870 out4f[x*4+1] = in4f[x*4+1] * c[1];
2871 out4f[x*4+2] = in4f[x*4+2] * c[2];
2872 out4f[x*4+3] = in4f[x*4+3] * c[3];
// For every pixel of the span, writes the interpolated four-component
// attribute arrayindex to out4f, evaluated as (data + slope * x) * z per
// component.
// NOTE(review): the assignment of z is not visible in this excerpt; it is
// presumably zf[x] - confirm.
2878 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2881 int startx = span->startx;
2882 int endx = span->endx;
// data/slope: attribute value and per-x slope supplied by the macro
2887 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2888 for (x = startx;x < endx;x++)
2891 c[0] = (data[0] + slope[0]*x) * z;
2892 c[1] = (data[1] + slope[1]*x) * z;
2893 c[2] = (data[2] + slope[2]*x) * z;
2894 c[3] = (data[3] + slope[3]*x) * z;
2895 out4f[x*4+0] = c[0];
2896 out4f[x*4+1] = c[1];
2897 out4f[x*4+2] = c[2];
2898 out4f[x*4+3] = c[3];
2904 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2906 int x, startx = span->startx, endx = span->endx;
2907 float c[4], localcolor[4];
2908 localcolor[0] = subcolor[0];
2909 localcolor[1] = subcolor[1];
2910 localcolor[2] = subcolor[2];
2911 localcolor[3] = subcolor[3];
2912 for (x = startx;x < endx;x++)
2914 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2915 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2916 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2917 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2918 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2919 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2920 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2921 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2927 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2929 int x, startx = span->startx, endx = span->endx;
2930 for (x = startx;x < endx;x++)
2932 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2933 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2934 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2935 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2941 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2943 int x, startx = span->startx, endx = span->endx;
2944 for (x = startx;x < endx;x++)
2946 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2947 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2948 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2949 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float spans per pixel: every channel of out4f is
// ina4f*a + inb4f*b for x in [span->startx, span->endx), where a is one minus
// the fourth channel of inb4f.  triangle is not referenced in the visible lines.
// NOTE(review): the declaration of a and b and the assignment of b are on lines
// this listing omits; b is presumably inb4f[x*4+3] (the complement of a) - confirm.
2955 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2957 int x, startx = span->startx, endx = span->endx;
2959 for (x = startx;x < endx;x++)
// weight of the first buffer: one minus the second buffer's fourth channel
2961 a = 1.0f - inb4f[x*4+3];
2963 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2964 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2965 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2966 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2972 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2974 int x, startx = span->startx, endx = span->endx;
2975 float localcolor[4], ilerp, lerp;
2976 localcolor[0] = color[0];
2977 localcolor[1] = color[1];
2978 localcolor[2] = color[2];
2979 localcolor[3] = color[3];
2980 ilerp = 1.0f - localcolor[3];
2981 lerp = localcolor[3];
2982 for (x = startx;x < endx;x++)
2984 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2985 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2986 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2987 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// BGRA8 version of the varying multiply: scales the pixels of in4ub by an
// interpolated attribute and stores them in out4ub for x in
// [span->startx, span->endx).  The attribute is evaluated exactly
// ((data + slope*x) * zf[x]) only at sub-span boundaries and stepped linearly
// in 16-bit fixed point (scale 256) in between.
// NOTE(review): this listing omits short lines; the declarations of x, data,
// slope, mod and endmod, the statements that carry endmod/endsubmod over into
// mod/submod, and the condition guarding the single-pixel tail are not visible.
2994 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2998 int startx = span->startx;
2999 int endx = span->endx;
3002 __m128i submod, substep, endsubmod;
3003 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 of the attribute (presumably RGBA -> BGRA to match the pixel layout)
3004 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3005 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as fixed point scaled by 256
3006 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3007 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3008 for (x = startx; x < endx;)
// default sub-span of DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the last one is clamped to endx-1
3010 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3011 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3012 if (nextsub >= endx)
3014 nextsub = endsub = endx-1;
3015 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute at the end of the sub-span and the fixed-point step towards it
3019 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3020 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3021 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16 bit: low half holds the modulator for pixel x, high half for pixel x+1
3022 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3023 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by two pixels but submod by a single substep; unless substep
// is doubled on a line not shown, the modulator moves at half rate inside a sub-span - verify.
3024 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// pixel bytes land in the high byte of each 16-bit lane (value*256), so the
// high half of the 16x16 product is pixel * modulator
3026 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3027 pix = _mm_mulhi_epu16(pix, submod);
3028 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single remaining pixel of the sub-span
3032 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3033 pix = _mm_mulhi_epu16(pix, submod);
3034 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 version of the varying write: stores an interpolated attribute as
// 8-bit pixels in out4ub for x in [span->startx, span->endx).  The attribute is
// evaluated exactly ((data + slope*x) * zf[x]) only at sub-span boundaries and
// stepped linearly in 16-bit fixed point (scale 4095) in between; the stored
// byte is the fixed-point value shifted right by 4 and saturated.
// NOTE(review): this listing omits short lines; the declarations of x, data,
// slope, mod and endmod, the statements that carry endmod/endsubmod over into
// mod/submod, and the condition guarding the single-pixel tail are not visible.
3041 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3045 int startx = span->startx;
3046 int endx = span->endx;
3049 __m128i submod, substep, endsubmod;
3050 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 of the attribute (presumably RGBA -> BGRA to match the pixel layout)
3051 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3052 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as fixed point scaled by 4095
3053 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3054 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3055 for (x = startx; x < endx;)
// default sub-span of DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the last one is clamped to endx-1
3057 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3058 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3059 if (nextsub >= endx)
3061 nextsub = endsub = endx-1;
3062 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute at the end of the sub-span and the fixed-point step towards it
3066 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3067 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3068 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16 bit: low half holds the value for pixel x, high half for pixel x+1
3069 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3070 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by two pixels but submod by a single substep; unless substep
// is doubled on a line not shown, the value moves at half rate inside a sub-span - verify.
3071 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits and saturate to bytes
3073 __m128i pix = _mm_srai_epi16(submod, 4);
3074 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single remaining pixel of the sub-span
3078 __m128i pix = _mm_srai_epi16(submod, 4);
3079 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 bloom combine: out4ub = ina4ub + max(inb4ub - subcolor*255, 0) per
// channel for pixels in [span->startx, span->endx), saturated to 0..255 when
// packed back to bytes.  Processes two pixels per iteration, then one leftover.
// NOTE(review): the condition guarding the single-pixel tail is on a line this
// listing omits.
3086 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3089 int x, startx = span->startx, endx = span->endx;
// subtract color scaled to 0..255 with lanes 0 and 2 swapped, then duplicated
// into both 64-bit halves so it lines up with two unpacked pixels
3090 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3091 localcolor = _mm_packs_epi32(localcolor, localcolor);
3092 for (x = startx;x+2 <= endx;x+=2)
// zero-extend two pixels of each input to 16 bits per channel
3094 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3095 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// the unsigned saturating subtract clamps the bloom term at zero
3096 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3097 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3101 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3102 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3103 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3104 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 channel-wise product of two spans: out4ub = (ina4ub * inb4ub) >> 8 for
// pixels in [span->startx, span->endx).  Processes two pixels per iteration,
// then one leftover.
// NOTE(review): the condition guarding the single-pixel tail is on a line this
// listing omits.
3109 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3112 int x, startx = span->startx, endx = span->endx;
3113 for (x = startx;x+2 <= endx;x+=2)
// pix1 is zero-extended (value), pix2 sits in the high byte (value*256), so
// the high 16 bits of their product are value1*value2/256
3115 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3116 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3117 pix1 = _mm_mulhi_epu16(pix1, pix2);
3118 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3122 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3123 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3124 pix1 = _mm_mulhi_epu16(pix1, pix2);
3125 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 channel-wise sum of two spans: out4ub = ina4ub + inb4ub for pixels in
// [span->startx, span->endx), clamped to 255 by the saturating pack.
// Processes two pixels per iteration, then one leftover.
// NOTE(review): the condition guarding the single-pixel tail is on a line this
// listing omits.
3130 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3133 int x, startx = span->startx, endx = span->endx;
3134 for (x = startx;x+2 <= endx;x+=2)
// widen to 16 bits so the sum cannot wrap before it is saturated
3136 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3137 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3138 pix1 = _mm_add_epi16(pix1, pix2);
3139 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3143 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3144 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3145 pix1 = _mm_add_epi16(pix1, pix2);
3146 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 tinted add: out4ub = ina4ub + inb4ub * inbtintbgra per channel for
// pixels in [span->startx, span->endx), clamped to 255 by the saturating pack.
// The tint is used in the order given (no lane swap is applied to it).
// Processes two pixels per iteration, then one leftover.
// NOTE(review): the condition guarding the single-pixel tail is on a line this
// listing omits.
3152 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3155 int x, startx = span->startx, endx = span->endx;
// tint as fixed point scaled by 256, duplicated into both 64-bit halves
3156 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3157 tint = _mm_packs_epi32(tint, tint);
3158 for (x = startx;x+2 <= endx;x+=2)
// pix2 sits in the high byte (value*256), so mulhi(tint, pix2) is tint*value
3160 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3161 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3162 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3163 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3167 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3168 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3169 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3170 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 blend of two spans weighted by the fourth channel of inb4ub:
// out4ub = ina4ub + (inb4ub - ina4ub) * blend / 256 per channel for pixels in
// [span->startx, span->endx).  Processes two pixels per iteration, then one
// leftover.
// NOTE(review): the condition guarding the single-pixel tail is on a line this
// listing omits.
3176 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3179 int x, startx = span->startx, endx = span->endx;
3180 for (x = startx;x+2 <= endx;x+=2)
3182 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3183 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth channel of pix2 to all four of its lanes
3184 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the high half of the signed product
// is (pix2 - pix1) * blend / 256
3185 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3186 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3190 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3191 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3192 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3193 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3194 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 blend of a constant color over a span, weighted by the color's own
// fourth component: out4ub = in4ub + (color*255 - in4ub) * blend / 256 per
// channel for pixels in [span->startx, span->endx).  Processes two pixels per
// iteration, then one leftover.
// NOTE(review): the condition guarding the single-pixel tail is on a line this
// listing omits.
3199 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3202 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255 with lanes 0 and 2 swapped, duplicated for two pixels
3203 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3204 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend weight: the color's fourth lane broadcast to every lane, pre-shifted by 4
3205 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3206 for (x = startx;x+2 <= endx;x+=2)
3208 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// difference shifted by 4 times weight shifted by 4, high half = diff * weight / 256
3209 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3210 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3214 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3215 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3216 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform, then loads the COLOR and TEXCOORD0 arrays; TEXCOORD1 is loaded only
// when the SPECULAR permutation bit is set.
3223 static void DPSOFTRAST_VertexShader_Generic(void)
3225 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3226 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3227 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3228 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3229 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3232 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3234 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3235 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3236 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3239 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3241 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3242 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3243 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3245 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3246 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3249 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3251 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3254 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3256 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3259 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3264 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3265 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform and loads the texture coordinate arrays.
// NOTE(review): the second load names two different arrays (TEXCOORD1 and
// TEXCOORD4), unlike every other load in this section - presumably one is
// remapped onto the other; confirm the argument order of DPSOFTRAST_Array_Load.
3270 static void DPSOFTRAST_VertexShader_PostProcess(void)
3272 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3273 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3274 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader for one span: samples the first
// texture unit into the fragment buffer, optionally adds bloom from the second
// unit (BLOOM permutation, thresholded by the BloomColorSubtract uniform),
// blends in the ViewTintColor uniform and hands the result to
// DPSOFTRAST_Draw_Span_FinishBGRA8.  The SATURATION and GAMMARAMPS
// permutations are recognised but have no visible implementation.
3277 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3279 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3280 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3281 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3282 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3283 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3284 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3285 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// bloom texture is sampled with attribute array 3 and added in place
3287 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3288 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3290 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3291 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3293 // TODO: implement saturation
3295 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3297 // TODO: implement gammaramps
3299 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is
// processed (DPSOFTRAST_Array_TransformProject with the
// ModelViewProjectionMatrixM1 uniform); no other arrays are loaded.
3304 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3306 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: zero-fills the span's pixels
// [span->startx, span->endx) in the fragment buffer and hands them to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
3309 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3311 // this is never called (because colormask is off when this shader is used)
3312 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3313 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3314 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3315 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3316 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform and TEXCOORD0 through DPSOFTRAST_Array_Transform with the
// TexMatrixM1 uniform.
3321 static void DPSOFTRAST_VertexShader_FlatColor(void)
3323 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3324 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span: every pixel is the color
// texture multiplied by the Color_Ambient uniform, with the Alpha uniform as
// the alpha multiplier.
// Pixels are written straight to the framebuffer row unless the ALPHAKILL
// permutation is set or the blend mode is not opaque; in that case they go to
// a temporary buffer that is finished by DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): this listing omits short lines; the declarations of color, pix
// and pix2, the statements that advance x after the 4-pixel path, and any
// per-pixel mask test before the single-pixel path are not visible here.
3327 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3330 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in the first color buffer (4 bytes per pixel)
3331 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3332 int x, startx = span->startx, endx = span->endx;
3333 __m128i Color_Ambientm;
3334 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3335 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3336 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3337 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3338 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span cannot be stored directly
3339 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3340 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with lanes 0 and 2 swapped; its fourth lane is
// cleared and replaced by the Alpha uniform scaled by 255, then everything is
// packed to 16 bit and duplicated for two pixels
3341 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3342 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3343 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3344 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3345 for (x = startx;x < endx;x++)
// fast path: at least four pixels remain and all four mask bytes are 1
3348 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texture bytes are placed in the high byte of each lane, so the high half of
// the product with the fixed-point color is texture * color
3351 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3352 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3353 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3354 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3360 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3361 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3362 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the temporary buffer still has to be written out
3364 if (pixel == buffer_FragColorbgra8)
3365 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform, loads the COLOR array and runs TEXCOORD0 through
// DPSOFTRAST_Array_Transform with the TexMatrixM1 uniform.
3371 static void DPSOFTRAST_VertexShader_VertexColor(void)
3373 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3374 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3375 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span: every pixel is
// texture * (Color_Diffuse * interpolated vertex color + Color_Ambient), with
// the Alpha uniform in the alpha lane of the ambient term.
// Pixels are written straight to the framebuffer row unless the ALPHAKILL
// permutation is set or the blend mode is not opaque; in that case they go to
// a temporary buffer that is finished by DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): this listing omits short lines; the declarations of data,
// slope, mod2 and pix2, the statements that advance x after the 4-pixel path,
// and any per-pixel mask test before the single-pixel path are not visible.
3378 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3381 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in the first color buffer (4 bytes per pixel)
3382 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3383 int x, startx = span->startx, endx = span->endx;
3384 __m128i Color_Ambientm, Color_Diffusem;
3386 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3387 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3388 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3389 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3390 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3391 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span cannot be stored directly
3392 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3393 pixel = buffer_FragColorbgra8;
// ambient: scaled by 256, lanes 0 and 2 swapped, fourth lane replaced by Alpha*255
3394 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3395 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3396 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3397 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: scaled by 4096, lanes 0 and 2 swapped, fourth lane cleared
3398 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3399 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3400 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// interpolated vertex color: lanes 0 and 2 swapped, advanced to startx, and
// both value and slope scaled by 4096 so that the high half of
// diffuse(x4096) * color(x4096) comes out scaled by 256 like the ambient term
3401 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3402 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3403 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3404 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3405 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3406 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration
3407 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3409 __m128i color, mod, pix;
// fast path: at least four pixels remain and all four mask bytes are 1
3410 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four z values; data is stepped three times here so that each pixel gets its
// own attribute value multiplied by its own z
3413 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3414 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3415 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3416 data = _mm_add_ps(data, slope);
3417 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3418 data = _mm_add_ps(data, slope);
3419 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3420 data = _mm_add_ps(data, slope);
3421 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex color + ambient) * texture, pixels 0-1 then pixels 2-3
3422 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3423 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3424 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3425 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3426 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3432 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3433 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3434 mod = _mm_packs_epi32(mod, mod);
3435 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3436 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the temporary buffer still has to be written out
3438 if (pixel == buffer_FragColorbgra8)
3439 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform, TEXCOORD0 through DPSOFTRAST_Array_Transform with the TexMatrixM1
// uniform, and loads TEXCOORD4 (the array the pixel stage uses for the
// lightmap lookup).
3445 static void DPSOFTRAST_VertexShader_Lightmap(void)
3447 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3448 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3449 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span: every pixel is
// texture * (Color_Diffuse * lightmap + Color_Ambient), plus
// Color_Glow * glow texture when the GLOW permutation is set.  The Alpha
// uniform occupies the alpha lane of the ambient term.
// Pixels are written straight to the framebuffer row unless the ALPHAKILL
// permutation is set or the blend mode is not opaque; in that case they go to
// a temporary buffer that is finished by DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): this listing omits short lines; the declaration of pix2, the
// statements that advance x after each 4-pixel path, any per-pixel mask test
// before the single-pixel paths, and the else that separates the glow loop
// from the non-glow loop are not visible here.
3452 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3455 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in the first color buffer (4 bytes per pixel)
3456 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3457 int x, startx = span->startx, endx = span->endx;
3458 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3459 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3460 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3461 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3462 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3463 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3464 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, lightmap uses TEXCOORD4
3465 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3466 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// redirect output to the temporary buffer when the span cannot be stored directly
3467 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3468 pixel = buffer_FragColorbgra8;
// ambient: scaled by 256, lanes 0 and 2 swapped, fourth lane replaced by Alpha*255
3469 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3470 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3471 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3472 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: scaled by 256, lanes 0 and 2 swapped, fourth lane cleared
3473 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3474 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3475 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3476 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and prepare its color like the diffuse one
3478 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3479 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3480 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3481 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow color (used by the single-pixel path)
3482 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3483 for (x = startx;x < endx;x++)
3485 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and all four mask bytes are 1
3486 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3489 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3490 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3491 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * texture + glow color * glow texture, pixels 0-1 then 2-3
3492 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3493 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3494 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3495 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3496 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3497 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3498 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit term is computed in the low 64 bits and the glow
// term in the high 64 bits (where the lightmap lanes are zero), then the two
// halves are added together
3504 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3505 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3506 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3507 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3508 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3509 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow
3514 for (x = startx;x < endx;x++)
3516 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and all four mask bytes are 1
3517 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3520 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3521 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * texture, pixels 0-1 then 2-3
3522 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3523 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3524 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3525 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3526 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3532 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3533 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3534 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3535 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the temporary buffer still has to be written out
3538 if (pixel == buffer_FragColorbgra8)
3539 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap entry points that
// follow call these two shaders before their definitions appear in the file.
3544 void DPSOFTRAST_VertexShader_LightDirection(void);
3545 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex stage: does no work of its own, it simply runs the
// LightDirection vertex shader.
3547 static void DPSOFTRAST_VertexShader_FakeLight(void)
3549 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel stage: forwards thread/triangle/span unchanged to the
// LightDirection pixel shader, which checks thread->shader_mode against
// SHADERMODE_FAKELIGHT to select the fake-light code path.
3552 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3554 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Model-space light-direction-map vertex stage: runs the LightDirection vertex
// shader and additionally loads TEXCOORD4, which the LightDirection pixel
// shader uses as the coordinate set for the lightmap and deluxemap lookups.
3559 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3561 DPSOFTRAST_VertexShader_LightDirection();
3562 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Model-space light-direction-map pixel stage: forwards its arguments unchanged
// to the LightDirection pixel shader, which checks thread->shader_mode against
// SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE to select the matching code path.
3565 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3567 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Tangent-space light-direction-map vertex stage: same two steps as the
// model-space variant - run the LightDirection vertex shader, then load
// TEXCOORD4 (the lightmap/deluxemap coordinate set used by the pixel shader).
3572 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3574 DPSOFTRAST_VertexShader_LightDirection();
3575 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Tangent-space light-direction-map pixel stage: forwards its arguments
// unchanged to the LightDirection pixel shader, which checks
// thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3578 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3580 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage shared by the LightDirection, FakeLight and LightDirectionMap
// shaders. For every vertex it:
//   - expresses the LightDir uniform in the per-vertex basis read from
//     TEXCOORD1/2/3 (svector, tvector, normal) and stores it in TEXCOORD5;
//   - expresses the vertex-to-eye offset (EyePosition - position) in the same
//     basis and stores it in TEXCOORD6;
// and finally transform-projects POSITION with the ModelViewProjectionMatrix
// uniform. Takes no parameters; all inputs and outputs go through the global
// dpsoftrast state.
3585 void DPSOFTRAST_VertexShader_LightDirection(void)
3588 int numvertices = dpsoftrast.numvertices;
3590 float LightVector[4];
3591 float EyePosition[4];
3592 float EyeVectorModelSpace[4];
// Fetch the two uniforms once, outside the vertex loop. All four components
// are copied, but the loop below only uses components 0..2.
3598 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3599 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3600 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3601 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3602 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3603 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3604 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3605 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex arrays that are read back from post_array4f below.
// TEXCOORD0 is the only one run through a matrix (TexMatrix); the others go
// through DPSOFTRAST_Array_Load (presumably a plain copy - confirm against
// that helper).
3606 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3607 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3608 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3609 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3610 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3611 for (i = 0;i < numvertices;i++)
// Gather this vertex's position and its (svector, tvector, normal) basis.
3613 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3614 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3615 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3616 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3617 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3618 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3619 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3620 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3621 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3622 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3623 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3624 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction in the vertex basis: one dot product per basis vector.
// The result is written to TEXCOORD5 with w forced to 0.
3625 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3626 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3627 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3628 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3629 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3630 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3631 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vertex-to-eye offset, projected onto the same basis and written to
// TEXCOORD6 with w forced to 0. It is not normalized here.
3632 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3633 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3634 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3635 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3636 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3637 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3638 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3639 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3640 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3641 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Done last because the loop above reads the untransformed POSITION values.
// NOTE(review): the meaning of the -1 second argument is defined by
// DPSOFTRAST_Array_TransformProject, not visible here - confirm.
3643 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helpers used by the pixel shaders below.
// Min/Max are plain ternary macros: each argument may be evaluated twice, so
// do not pass expressions with side effects.
3646 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3647 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// 3-component dot product of two indexable operands (arrays or pointers).
3648 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length is the dot product of v with itself; Length takes sqrt() of it.
3649 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3650 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro (continued with backslashes); it starts by computing the
// length of v as sqrt(dot(v,v)) into a local float named len.
3651 #define DPSOFTRAST_Vector3Normalize(v)\
3654 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel stage shared by the LightDirection, FakeLight and LightDirectionMap
// shaders. It shades one span (pixels startx..endx-1) into
// buffer_FragColorbgra8 and hands the result to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// Parameters:
//   thread   - per-thread state; supplies uniform4f, shader_permutation,
//              shader_mode and shader_exactspecularmath.
//   triangle - triangle being rasterized; passed through to the span helpers.
//   span     - span to shade; startx/endx give the pixel range.
//
// Three shading paths are selected by thread->shader_permutation:
//   SPECULAR - ambient + diffuse + specular (+ glow)
//   DIFFUSE  - ambient + diffuse (+ glow)
//   neither  - ambient only (+ glow)
// Within the first two, thread->shader_mode selects where the light direction
// and light colour come from (deluxemap/lightmap, eye vector, or the
// interpolated TEXCOORD5 light vector).
3665 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, indexed by pixel x (4 bytes per pixel for the
// *bgra8 ones).
3667 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3668 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3669 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3670 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3672 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3673 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3675 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677 int x, startx = span->startx, endx = span->endx;
3678 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// "data" is an interpolant's base value and "slope" its per-pixel step in x;
// both are filled in by DPSOFTRAST_CALCATTRIB4F and evaluated below as
// data + slope*x.
3679 float LightVectordata[4];
3680 float LightVectorslope[4];
3681 float EyeVectordata[4];
3682 float EyeVectorslope[4];
3683 float VectorSdata[4];
3684 float VectorSslope[4];
3685 float VectorTdata[4];
3686 float VectorTslope[4];
3687 float VectorRdata[4];
3688 float VectorRslope[4];
3690 float diffusetex[4];
3692 float surfacenormal[4];
3693 float lightnormal[4];
3694 float lightnormal_modelspace[4];
3696 float specularnormal[4];
3699 float SpecularPower;
// Colour uniforms are copied with components 0 and 2 swapped, so that index 0
// lines up with byte 0 of the *bgra8 texture buffers they are multiplied with.
3701 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3702 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3703 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3704 Color_Glow[3] = 0.0f;
3705 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3706 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3707 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The ambient colour's 4th slot carries the Alpha uniform; it is the only
// factor applied to the output alpha channel (d[3]) in every path below.
3708 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3709 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3710 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3711 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3712 Color_Pants[3] = 0.0f;
3713 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3714 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3715 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3716 Color_Shirt[3] = 0.0f;
// Start the span (buffer_z is handed to every texture fetch afterwards) and
// sample the textures that do not depend on the lighting path.
3717 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3718 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3719 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3721 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3722 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3726 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: SPECULAR permutation (ambient + diffuse + specular) ----
3728 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3730 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3731 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3732 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3733 Color_Diffuse[3] = 0.0f;
// Default light colour from the uniform; the lightmap and fake-light modes
// overwrite components 0..2 per pixel inside the loop.
3734 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3735 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3736 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3737 LightColor[3] = 0.0f;
3738 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3739 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3740 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3741 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3742 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by glosstex[3], which is
// a raw texture byte (0..255), when forming the pow() exponent.
3743 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed by the specular term in every mode.
3744 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3745 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs. Model-space deluxemaps additionally need the
// TEXCOORD1/2/3 vectors (S, T, R) to rotate the sampled direction per pixel.
3747 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3749 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3750 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3751 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3752 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3753 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3755 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3757 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3758 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3760 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3762 // nothing of this needed
3766 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel shading loop for the specular path.
3769 for (x = startx;x < endx;x++)
3772 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3773 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3774 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3775 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping adds the tinted pants and shirt layers onto the base colour.
// NOTE(review): this is the only loop that applies them; the DIFFUSE and
// ambient-only loops further down use the base colour alone even though the
// pants/shirt textures are sampled whenever COLORMAPPING is set - confirm
// this is intended.
3776 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3778 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3779 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3780 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3781 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3783 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3784 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3785 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3786 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte * (1/128) - 1 maps 0..255 to about -1..+1.
// Byte 2 feeds component 0 and byte 0 feeds component 2 (BGR -> xyz).
3787 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3788 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3789 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3790 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Light direction and colour for this pixel, by shader mode.
3792 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3794 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3795 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3796 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3797 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3799 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3800 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3801 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3802 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3804 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3805 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3806 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3807 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3809 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3810 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3811 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3812 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3814 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3815 DPSOFTRAST_Vector3Normalize(lightnormal);
3817 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes are scaled by 1/256 and divided by the light normal's
// component 2, which is clamped to at least 0.25 so the divisor cannot
// approach zero.
3819 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3820 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3821 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3822 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3825 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// Tangent-space deluxemap: the decoded direction is used as-is (no S/T/R
// rotation and no normalization in this branch); lightmap scaled by 1/256.
3827 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3828 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3829 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3831 float f = 1.0f / 256.0f;
3832 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3833 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3834 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3837 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: the normalized eye vector doubles as the light direction and
// the light colour is white.
// NOTE(review): z is assigned outside the lines visible here - presumably
// the per-pixel value from buffer_z; confirm.
3839 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3840 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3841 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3842 DPSOFTRAST_Vector3Normalize(lightnormal);
3844 LightColor[0] = 1.0;
3845 LightColor[1] = 1.0;
3846 LightColor[2] = 1.0;
// Plain light-direction mode: use the interpolated TEXCOORD5 light vector
// and keep the uniform LightColor loaded before the loop.
3850 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3851 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3852 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3853 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(surface normal, light direction), clamped below at 0.
3856 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3858 if(thread->shader_exactspecularmath)
3860 // reflect lightnormal at surfacenormal, take the negative of that
3861 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3863 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3864 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3865 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3866 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3868 // dot of this and normalize(EyeVectorFogDepth.xyz)
3869 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3870 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3871 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3872 DPSOFTRAST_Vector3Normalize(eyenormal);
3874 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Half-vector variant: normalize(light + eye), then dot with the surface
// normal, clamped below at 0.
3878 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3879 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3880 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3881 DPSOFTRAST_Vector3Normalize(eyenormal);
3883 specularnormal[0] = lightnormal[0] + eyenormal[0];
3884 specularnormal[1] = lightnormal[1] + eyenormal[1];
3885 specularnormal[2] = lightnormal[2] + eyenormal[2];
3886 DPSOFTRAST_Vector3Normalize(specularnormal);
3888 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is 1 + SpecularPower * gloss alpha byte (SpecularPower already
// carries the 1/255 factor).
3890 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Combine the terms per channel: glow (optional) + ambient + (diffuse +
// specular) * light colour. Results are clamped at 255 only (no lower
// clamp); alpha is the diffuse alpha times Color_Ambient[3].
3892 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3894 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3895 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3896 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3897 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3901 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3902 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3903 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3904 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3907 buffer_FragColorbgra8[x*4+0] = d[0];
3908 buffer_FragColorbgra8[x*4+1] = d[1];
3909 buffer_FragColorbgra8[x*4+2] = d[2];
3910 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: DIFFUSE permutation (ambient + diffuse, no specular) ----
// Same light-direction/light-colour selection as path 1, without the gloss
// texture and specular term.
3913 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3915 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3916 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3917 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3918 Color_Diffuse[3] = 0.0f;
3919 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3920 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3921 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3922 LightColor[3] = 0.0f;
3923 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3925 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3927 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3928 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3929 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3930 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3931 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3933 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3935 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3936 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3938 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Unlike path 1, the eye vector is only needed here when it stands in for
// the light direction (fake light), so it is set up in this branch only.
3940 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3944 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel shading loop for the diffuse path.
3947 for (x = startx;x < endx;x++)
3950 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3951 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3952 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3953 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Normal map decode, identical to path 1 (byte * 1/128 - 1, BGR -> xyz).
3954 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3955 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3956 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3957 DPSOFTRAST_Vector3Normalize(surfacenormal);
3959 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3961 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3962 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3963 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3964 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3966 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3967 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3968 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3969 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3971 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3972 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3973 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3974 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3976 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3977 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3978 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3979 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3981 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3982 DPSOFTRAST_Vector3Normalize(lightnormal);
3984 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Same lightmap scaling as path 1: 1/256, divided by component 2 of the
// light normal clamped to at least 0.25.
3986 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3987 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3988 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3989 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3992 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3994 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3995 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3996 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3998 float f = 1.0f / 256.0f;
3999 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4000 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4001 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4004 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: normalized eye vector as light direction, white light.
4006 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4007 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4008 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4009 DPSOFTRAST_Vector3Normalize(lightnormal);
4011 LightColor[0] = 1.0;
4012 LightColor[1] = 1.0;
4013 LightColor[2] = 1.0;
// Plain light-direction mode: interpolated TEXCOORD5 light vector.
4017 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4018 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4019 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4020 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term clamped below at 0, then per channel:
// glow (optional) + colour * (ambient + diffuse colour * diffuse * light).
4023 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4024 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4026 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4027 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4028 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4029 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4033 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4034 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4035 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4036 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4038 buffer_FragColorbgra8[x*4+0] = d[0];
4039 buffer_FragColorbgra8[x*4+1] = d[1];
4040 buffer_FragColorbgra8[x*4+2] = d[2];
4041 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only ----
// No normal map or light terms are evaluated in this loop: each channel is
// colour * ambient, plus glow when the GLOW permutation is set.
4046 for (x = startx;x < endx;x++)
4049 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4050 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4051 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4052 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4054 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4056 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4057 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4058 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4059 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4063 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4064 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4065 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4066 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4068 buffer_FragColorbgra8[x*4+0] = d[0];
4069 buffer_FragColorbgra8[x*4+1] = d[1];
4070 buffer_FragColorbgra8[x*4+2] = d[2];
4071 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the assembled BGRA8 span colours to the span finisher.
4074 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4079 static void DPSOFTRAST_VertexShader_LightSource(void)
4082 int numvertices = dpsoftrast.numvertices;
4083 float LightPosition[4];
4084 float LightVector[4];
4085 float LightVectorModelSpace[4];
4086 float EyePosition[4];
4087 float EyeVectorModelSpace[4];
4093 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4094 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4095 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4096 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4097 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4098 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4099 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4100 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4101 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4102 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4103 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4104 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4105 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4106 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4107 for (i = 0;i < numvertices;i++)
4109 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4110 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4111 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4112 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4113 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4114 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4115 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4116 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4117 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4118 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4119 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4120 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4121 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4122 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4123 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4124 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4125 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4126 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4127 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4128 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4129 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4130 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4131 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4132 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4133 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4134 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4135 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4136 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4137 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4138 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4139 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4140 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4142 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4143 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for the light-source mode: computes BGRA8 colors for the pixels
// startx..endx-1 of one span and hands the buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - supplies uniform4f, shader_permutation and shader_exactspecularmath
//   triangle - forwarded to the attribute/span helpers
//   span     - provides startx/endx and is forwarded to the same helpers
// Three per-pixel loops are visible: one under SHADERPERMUTATION_SPECULAR
// (ambient + diffuse + specular), one under SHADERPERMUTATION_DIFFUSE
// (ambient + diffuse), and a last one using only the ambient term.
// NOTE(review): this listing skips some original lines (the embedded numbers
// jump, e.g. 4146 -> 4149 and 4224 -> 4227). Braces, the declarations of z,
// glosstex, eyenormal, diffuse, specular, attenuation, f and d, the statements
// guarded by the `attenuation < 0.01f` tests, and any `else` keywords are not
// visible here - check the full file before relying on exact control flow.
4146 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4149 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4150 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4151 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4152 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4153 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4154 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4155 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4156 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4157 int x, startx = span->startx, endx = span->endx;
4158 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4159 float CubeVectordata[4];
4160 float CubeVectorslope[4];
4161 float LightVectordata[4];
4162 float LightVectorslope[4];
4163 float EyeVectordata[4];
4164 float EyeVectorslope[4];
4166 float diffusetex[4];
4168 float surfacenormal[4];
4169 float lightnormal[4];
4171 float specularnormal[4];
4174 float SpecularPower;
4175 float CubeVector[4];
// Uniform colors are copied with components 0 and 2 exchanged (uniform +0 goes
// to local [2]); presumably RGB -> BGR to match the bgra8 buffers - confirm.
// Color_Glow is filled in but not read by any line visible in this block.
4178 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4179 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4180 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4181 Color_Glow[3] = 0.0f;
4182 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4183 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4184 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// element 3 of the ambient color is taken from the Alpha uniform
4185 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4186 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4187 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4188 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4189 Color_Diffuse[3] = 0.0f;
4190 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4191 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4192 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4193 Color_Specular[3] = 0.0f;
4194 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4195 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4196 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4197 Color_Pants[3] = 0.0f;
4198 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4199 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4200 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4201 Color_Shirt[3] = 0.0f;
4202 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4203 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4204 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4205 LightColor[3] = 0.0f;
// Scaled by 1/255 up front; it is later multiplied by glosstex[3], which is
// read from an unsigned char buffer.
4206 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Set up data/slope pairs for the three interpolated vectors; the loops below
// evaluate each of them as (data + slope*x) * z.
4207 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4208 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4209 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4210 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4211 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// Fetch the texture spans required by the enabled permutation bits: color
// always, pants/shirt with COLORMAPPING, cube with CUBEFILTER.
4212 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4213 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4215 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4216 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4218 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4219 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- Path 1: SPECULAR permutation (also needs the normal and gloss spans) ----
4220 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4222 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4223 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4224 for (x = startx;x < endx;x++)
// CubeVector is the interpolated TEXCOORD3 attribute; attenuation is
// 1 - |CubeVector|^2.
4227 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4228 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4229 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4230 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by each `attenuation < 0.01f` test is
// not visible in this listing.
4231 if (attenuation < 0.01f)
4233 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4235 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4236 if (attenuation < 0.01f)
// Start from the color texel; with COLORMAPPING add the pants and shirt
// texels scaled by their uniform colors.
4240 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4241 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4242 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4243 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4244 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4246 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4247 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4248 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4249 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4251 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4252 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4253 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4254 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Unpack the normal-map texel: bytes 2,1,0 become components 0,1,2, each
// mapped via b/128 - 1, then the vector is normalized.
4255 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4256 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4257 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4258 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Interpolated TEXCOORD1 vector, normalized.
4260 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4261 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4262 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4263 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(surfacenormal, lightnormal), clamped at zero.
4265 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4267 if(thread->shader_exactspecularmath)
4269 // reflect lightnormal at surfacenormal, take the negative of that
4270 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4272 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4273 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4274 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4275 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4277 // dot of this and normalize(EyeVectorFogDepth.xyz)
4278 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4279 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4280 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4281 DPSOFTRAST_Vector3Normalize(eyenormal);
4283 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Lines 4284-4286 are not visible; presumably what follows is the else branch
// of the shader_exactspecularmath test - confirm. It uses
// normalize(lightnormal + eyenormal) dotted with the surface normal.
4287 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4288 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4289 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4290 DPSOFTRAST_Vector3Normalize(eyenormal);
4292 specularnormal[0] = lightnormal[0] + eyenormal[0];
4293 specularnormal[1] = lightnormal[1] + eyenormal[1];
4294 specularnormal[2] = lightnormal[2] + eyenormal[2];
4295 DPSOFTRAST_Vector3Normalize(specularnormal);
4297 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is 1 + SpecularPower * (alpha byte of the gloss texel).
4299 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Each color channel below is truncated to int and clamped to at most 255;
// no lower clamp is visible. Alpha (d[3]) comes from diffusetex[3] alone.
4301 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4303 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4304 attenuation *= (1.0f / 255.0f);
4305 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4306 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4307 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4308 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Same formula without the cube texel factor (presumably the else branch of
// the CUBEFILTER test; lines 4309-4311 are not visible).
4312 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4313 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4314 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4315 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Store the four channels of this pixel.
4317 buffer_FragColorbgra8[x*4+0] = d[0];
4318 buffer_FragColorbgra8[x*4+1] = d[1];
4319 buffer_FragColorbgra8[x*4+2] = d[2];
4320 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: DIFFUSE permutation (needs the normal span, no gloss/specular) ----
4323 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4325 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4326 for (x = startx;x < endx;x++)
// Attenuation and diffuse texel are computed exactly as in path 1.
4329 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4330 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4331 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4332 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4333 if (attenuation < 0.01f)
4335 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4337 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4338 if (attenuation < 0.01f)
4342 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4343 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4344 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4345 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4346 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4348 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4349 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4350 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4351 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// Normal-map unpacking and the clamped diffuse dot product, as in path 1.
4353 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4354 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4355 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4356 DPSOFTRAST_Vector3Normalize(surfacenormal);
4358 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4359 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4360 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4361 DPSOFTRAST_Vector3Normalize(lightnormal);
4363 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4364 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4366 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4367 attenuation *= (1.0f / 255.0f);
4368 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4369 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4370 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4371 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Variant without the cube texel factor (lines 4372-4374 are not visible).
4375 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4376 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4377 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4378 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4380 buffer_FragColorbgra8[x*4+0] = d[0];
4381 buffer_FragColorbgra8[x*4+1] = d[1];
4382 buffer_FragColorbgra8[x*4+2] = d[2];
4383 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient-only loop (lines 4384-4387 are not visible; presumably
// the final else branch - confirm). No normal map is sampled here. ----
4388 for (x = startx;x < endx;x++)
4391 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4392 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4393 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4394 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4395 if (attenuation < 0.01f)
4397 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4399 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4400 if (attenuation < 0.01f)
4404 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4405 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4406 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4407 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4408 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4410 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4411 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4412 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4413 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4415 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4417 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4418 attenuation *= (1.0f / 255.0f);
4419 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4420 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4421 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4422 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Variant without the cube texel factor (lines 4423-4425 are not visible).
4426 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4427 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4428 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4429 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4431 buffer_FragColorbgra8[x*4+0] = d[0];
4432 buffer_FragColorbgra8[x*4+1] = d[1];
4433 buffer_FragColorbgra8[x*4+2] = d[2];
4434 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span colors to the common span finisher.
4437 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for refraction mode. Issues three array transforms using
// matrices taken from dpsoftrast.uniform4f:
//   - POSITION with the ModelViewProjection matrix into TEXCOORD4 (read back
//     by the refraction pixel shader as ModelViewProjectionPosition),
//   - TEXCOORD0 with the TexMatrix,
//   - POSITION with the ModelViewProjection matrix via the TransformProject helper.
// NOTE(review): the helpers' argument order appears to be (output array,
// input array, matrix) - confirm against their definitions. The function's
// braces are not visible in this listing.
4443 static void DPSOFTRAST_VertexShader_Refraction(void)
4445 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4446 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4447 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for refraction mode: for every pixel startx..endx-1 of the span
// it samples the texture bound at GL20TU_REFRACTION at a screen coordinate
// offset by the normal-map texel, scales the sample by RefractColor, and passes
// the buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - supplies texbound[] and uniform4f
//   triangle - forwarded to the attribute/span helpers
//   span     - provides startx/endx and is forwarded to the same helpers
// Returns immediately, before any span setup, when no refraction texture is bound.
// NOTE(review): this listing skips some original lines; braces and the
// declarations of iw, v and c are not visible here.
4450 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4452 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4454 int x, startx = span->startx, endx = span->endx;
4457 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4458 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4461 float ModelViewProjectionPositiondata[4];
4462 float ModelViewProjectionPositionslope[4];
4465 float ScreenScaleRefractReflect[2];
4466 float ScreenCenterRefractReflect[2];
4467 float DistortScaleRefractReflect[2];
4468 float RefractColor[4];
4470 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4471 if(!texture) return;
4474 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4475 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// data/slope pair for TEXCOORD4; evaluated per pixel as data + slope*x
4478 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
// Only the first two components of the scale/center/distort uniforms are used here.
4481 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4482 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4483 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4484 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4485 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4486 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// RefractColor components 0 and 2 are exchanged relative to the uniform
// (presumably RGB -> BGR - confirm); element 3 is copied unchanged.
4487 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4488 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4489 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4490 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4493 for (x = startx;x < endx;x++)
4495 float SafeScreenTexCoord[2];
4496 float ScreenTexCoord[2];
4503 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4504 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4506 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4507 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4508 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the GLSL quoted below multiplies by
// DistortScaleRefractReflect.zw, while the code uses elements [0] and [1] -
// confirm which is intended.
4510 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// Unpack the normal-map texel (bytes 2,1,0 -> components 0,1,2, b/128 - 1) and normalize.
4511 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4512 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4513 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4514 DPSOFTRAST_Vector3Normalize(v);
4515 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4516 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4518 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4519 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// NOTE(review): the three color stores have no explicit clamp (only the alpha
// line uses min(..., 255)); confirm c[i] * RefractColor[i] cannot exceed the
// unsigned char range.
4521 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4522 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4523 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4524 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4527 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for water mode. For each of dpsoftrast.numvertices vertices it
// computes EyePosition - position and projects that vector onto the three
// per-vertex vectors read from TEXCOORD1, TEXCOORD2 and TEXCOORD3 (svector,
// tvector, normal), storing the result in TEXCOORD6 with a fourth component
// of 0. It then transforms POSITION, TEXCOORD4 and TEXCOORD0.
// NOTE(review): this listing skips some original lines; braces and the
// declarations of i, position, svector, tvector, normal and EyeVector are not
// visible here.
4532 static void DPSOFTRAST_VertexShader_Water(void)
4535 int numvertices = dpsoftrast.numvertices;
4536 float EyePosition[4];
4537 float EyeVectorModelSpace[4];
// EyePosition[3] is loaded but not read by any line visible in this block.
4543 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4544 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4545 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4546 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// The loop below reads these four arrays from dpsoftrast.post_array4f;
// presumably the Load calls are what populate them - confirm against the helper.
4547 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4548 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4549 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4550 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4551 for (i = 0;i < numvertices;i++)
4553 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4554 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4555 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4556 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4557 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4558 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4559 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4560 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4561 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4562 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4563 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4564 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the eye position.
4565 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4566 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4567 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// Dot products with svector, tvector and normal give the three components.
4568 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4569 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4570 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// Written to TEXCOORD6, which the water pixel shader reads as EyeVector.
4571 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4572 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4573 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4574 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): the -1 input argument presumably means "use the data already
// loaded for POSITION" - confirm against DPSOFTRAST_Array_TransformProject.
4576 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4577 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4578 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for water mode: for every pixel startx..endx-1 of the span it
// samples the textures bound at GL20TU_REFRACTION and GL20TU_REFLECTION at
// screen coordinates offset by the normal-map texel, then blends the two
// (each scaled by its color uniform) with a Fresnel weight derived from the
// interpolated eye vector. The result goes to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - supplies texbound[] and uniform4f
//   triangle - forwarded to the attribute/span helpers
//   span     - provides startx/endx and is forwarded to the same helpers
// Returns immediately, before any span setup, when either texture is unbound.
// NOTE(review): this listing skips some original lines; braces and the
// declarations of iw, v and Fresnel are not visible here.
4582 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4584 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4586 int x, startx = span->startx, endx = span->endx;
4589 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4590 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4593 float ModelViewProjectionPositiondata[4];
4594 float ModelViewProjectionPositionslope[4];
4595 float EyeVectordata[4];
4596 float EyeVectorslope[4];
4599 float ScreenScaleRefractReflect[4];
4600 float ScreenCenterRefractReflect[4];
4601 float DistortScaleRefractReflect[4];
4602 float RefractColor[4];
4603 float ReflectColor[4];
4604 float ReflectFactor;
4605 float ReflectOffset;
4607 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4608 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4609 if(!texture_refraction || !texture_reflection) return;
4612 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4613 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// data/slope pairs for TEXCOORD4 and TEXCOORD6; evaluated per pixel as data + slope*x
4616 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4617 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// All four components are used: [0],[1] feed the refraction lookup and
// [2],[3] feed the reflection lookup below.
4620 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4621 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4622 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4623 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4624 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4625 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4626 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4627 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4628 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4629 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4630 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4631 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
// Both colors are copied with components 0 and 2 exchanged (presumably
// RGB -> BGR - confirm); element 3 is copied unchanged.
4632 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4633 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4634 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4635 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4636 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4637 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4638 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4639 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4640 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4641 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4644 for (x = startx;x < endx;x++)
4646 float SafeScreenTexCoord[4];
4647 float ScreenTexCoord[4];
4650 unsigned char c1[4];
4651 unsigned char c2[4];
4656 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4657 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4659 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4660 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4661 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4662 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4663 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4665 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
// Unpack the normal-map texel (bytes 2,1,0 -> components 0,1,2, b/128 - 1) and normalize.
4666 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4667 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4668 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4669 DPSOFTRAST_Vector3Normalize(v);
4670 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4671 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4672 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4673 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4675 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
// v is reused for the eye vector; the pow(..., 2.0) of the quoted GLSL is
// written out as Fresnel * Fresnel.
4676 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4677 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4678 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4679 DPSOFTRAST_Vector3Normalize(v);
4680 Fresnel = 1.0f - v[2];
4681 Fresnel = min(1.0f, Fresnel);
4682 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4684 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4685 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4686 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4687 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
// Linear blend: refraction weighted by (1 - Fresnel), reflection by Fresnel.
// NOTE(review): the three color stores have no explicit clamp (only the alpha
// line uses min(..., 255)); confirm the blended value always fits an unsigned char.
4689 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4690 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4691 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4692 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4695 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the ShowDepth shader mode.
// Hands the position array (used as both the first and second array argument) to
// DPSOFTRAST_Array_TransformProject together with the matrix stored at uniform slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (4 floats per uniform slot).
4700 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4702 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the ShowDepth shader mode.
// As written this is a placeholder: it outputs zero bytes for every pixel of the span.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4705 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4708 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4709 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is handed to the span setup helper; it is not read again in this function
4710 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear only the [startx, endx) part of the output buffer, 4 bytes per pixel
4711 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4712 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the DeferredGeometry shader mode.
// Hands the position array (used as both the first and second array argument) to
// DPSOFTRAST_Array_TransformProject together with the matrix stored at uniform slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (4 floats per uniform slot).
4717 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4719 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the DeferredGeometry shader mode.
// As written this is a placeholder: it outputs zero bytes for every pixel of the span.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4722 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4725 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4726 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is handed to the span setup helper; it is not read again in this function
4727 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear only the [startx, endx) part of the output buffer, 4 bytes per pixel
4728 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4729 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the DeferredLightSource shader mode.
// Hands the position array (used as both the first and second array argument) to
// DPSOFTRAST_Array_TransformProject together with the matrix stored at uniform slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (4 floats per uniform slot).
4734 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4736 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the DeferredLightSource shader mode.
// As written this is a placeholder: it outputs zero bytes for every pixel of the span.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4739 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4742 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4743 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is handed to the span setup helper; it is not read again in this function
4744 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear only the [startx, endx) part of the output buffer, 4 bytes per pixel
4745 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4746 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode of the software rasterizer: the functions to run and the
// inputs they consume. One instance per shader mode lives in DPSOFTRAST_ShaderModeTable.
// NOTE(review): the member declared before Vertex is not visible in this excerpt; the
// draw code reads a member named lodarrayindex from this struct - confirm it is that one.
4751 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; called with no arguments by DPSOFTRAST_DrawTriangles
4754 void (*Vertex)(void);
// span (pixel) stage; called once per span by DPSOFTRAST_Draw_ProcessSpans
4755 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array indices used by this mode; readers stop at the first value >= DPSOFTRAST_ARRAY_TOTAL
4756 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by this mode; readers stop at the first value >= DPSOFTRAST_MAXTEXTUREUNITS
4757 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4759 DPSOFTRAST_ShaderModeInfo;
4761 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4763 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4764 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4765 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4766 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4767 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4768 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4769 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4770 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4771 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4772 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4773 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4774 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4775 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4776 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4777 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4778 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4779 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4780 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel pass/fail mask for one span and stores it in the span.
//   thread - supplies the depth-test state (depthtest, fb_depthfunc) and the mask storage
//            (pixelmaskarray).
//   span   - span to test; on return span->pixelmask points at the mask and span->startx
//            holds the possibly advanced start column.
// NOTE(review): some declarations, the loop bodies that trim the span and the write-back
// of endx are not visible in this excerpt.
4783 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4788 unsigned int *depthpixel;
4792 unsigned char *pixelmask;
4793 DPSOFTRAST_State_Triangle *triangle;
4794 triangle = &thread->triangles[span->triangle];
// depthpixel is based at the span's (x, y) origin, so it is indexed with the same
// span-relative x as pixelmask below
4795 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4796 startx = span->startx;
4798 depth = span->depthbase;
4799 depthslope = span->depthslope;
4800 pixelmask = thread->pixelmaskarray;
4801 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4803 switch(thread->fb_depthfunc)
// each case steps the interpolated depth d (depthbase + depthslope*x) across the span and
// records the comparison of the stored depth value against d
4806 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4807 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4808 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4809 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4810 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4811 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4812 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// trim failed pixels off both ends of the span (loop bodies not visible here; presumably
// they advance startx and pull back endx)
4814 while (startx < endx && !pixelmask[startx])
4816 while (endx > startx && !pixelmask[endx-1])
4821 // no depth testing means we're just dealing with color...
4822 memset(pixelmask + startx, 1, endx - startx);
4824 span->pixelmask = pixelmask;
4825 span->startx = startx;
// Depth-buffer update pass for one span, run after the span's pixel shader.
// Does nothing unless depth writes (depthmask) and depth testing are both enabled and a
// depth buffer is attached.
//   thread - supplies the depthmask / depthtest state.
//   span   - supplies the depth plane (depthbase, depthslope), the column range and the
//            pixel mask produced by the depth test.
// NOTE(review): the body of the per-pixel loop is not visible in this excerpt; presumably
// it stores d into depthpixel[x] for pixels whose mask is set - confirm.
4829 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4831 int x, d, depth, depthslope, startx, endx;
4832 const unsigned char *pixelmask;
4833 unsigned int *depthpixel;
4834 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4836 depth = span->depthbase;
4837 depthslope = span->depthslope;
4838 pixelmask = span->pixelmask;
4839 startx = span->startx;
// based at the span's (x, y) origin, indexed by span-relative x
4841 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// d is the interpolated depth at column x
4842 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Drains the thread's queued spans: depth test, pixel shader, then depth write for each
// span, and finally resets the queue length to zero.
//   thread - worker state holding the spans[] / triangles[] queues and the render state.
4848 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4851 DPSOFTRAST_State_Triangle *triangle;
4852 DPSOFTRAST_State_Span *span;
4853 for (i = 0; i < thread->numspans; i++)
4855 span = &thread->spans[i];
4856 triangle = &thread->triangles[span->triangle];
4857 DPSOFTRAST_Draw_DepthTest(thread, span);
// the depth test may have trimmed the span to nothing (the statement taken in that case
// is not visible in this excerpt; presumably it skips to the next span)
4858 if (span->startx >= span->endx)
4860 // run pixel shader if appropriate
4861 // do this before running depthmask code, to allow the pixelshader
4862 // to clear pixelmask values for alpha testing
4863 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4864 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4865 DPSOFTRAST_Draw_DepthWrite(thread, span);
4867 thread->numspans = 0;
// Declares the Draw command (number 22) and its payload fields through the DEFCOMMAND
// macro, which is defined outside this excerpt. As used by the code below:
//   starty/endy          - framebuffer row range covered by the draw
//   refcount             - set to the thread count on submission, decremented by each thread
//   clipped              - copied from dpsoftrast.drawclipped on submission
//   arrays               - screen coordinates followed by the post-transform vertex arrays
//   element3i/element3s  - 32-bit / 16-bit triangle index lists (at most one is non-NULL)
4870 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
// Executes one Draw command on a worker thread.
// For every triangle of the command it clips against the near plane, clamps the screen
// bounds to the scissor rectangle and to the rows owned by this thread, builds the
// interpolation planes for w and for each attribute array used by the current shader
// mode, selects a mip level per texture unit, and queues spans of at most
// DPSOFTRAST_DRAW_MAXSPANLENGTH pixels. Queued spans are rendered by
// DPSOFTRAST_Draw_ProcessSpans whenever the span or triangle queue fills up and once more
// at the end. The thread's reference on the command is dropped before returning.
//   thread  - worker state (scissor, viewport, row bands, queues, bound textures).
//   command - the Draw command to execute.
// NOTE(review): many lines of this function (braces, some declarations, several
// conditions) are not visible in this excerpt; the comments below describe only what the
// visible lines establish.
4872 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4875 int cullface = thread->cullface;
4876 int minx, maxx, miny, maxy;
4877 int miny1, maxy1, miny2, maxy2;
4878 __m128i fbmin, fbmax;
4879 __m128 viewportcenter, viewportscale;
4880 int firstvertex = command->firstvertex;
4881 int numvertices = command->numvertices;
4882 int numtriangles = command->numtriangles;
4883 const int *element3i = command->element3i;
4884 const unsigned short *element3s = command->element3s;
4885 int clipped = command->clipped;
4892 int starty, endy, bandy;
4896 float clip0origin, clip0slope;
4898 __m128 triangleedge1, triangleedge2, trianglenormal;
4901 DPSOFTRAST_State_Triangle *triangle;
4902 DPSOFTRAST_Texture *texture;
4903 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two row bands (miny1..maxy1 and miny2..maxy2) to the scissor rows
4904 miny = thread->fb_scissor[1];
4905 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4906 miny1 = bound(miny, thread->miny1, maxy);
4907 maxy1 = bound(miny, thread->maxy1, maxy);
4908 miny2 = bound(miny, thread->miny2, maxy);
4909 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range misses both bands: nothing to draw on this thread
4910 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
// last thread to release the command frees the vertex data, but only when it was
// allocated separately (the command is then no larger than its bare header)
4912 if (!ATOMIC_DECREMENT(command->refcount))
4914 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4915 MM_FREE(command->arrays);
4919 minx = thread->fb_scissor[0];
4920 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp bounds replicated as 16-bit (x, y) pairs; fbmax is made inclusive by subtracting 1
4921 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4922 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4923 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4924 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4925 screen[3] = _mm_setzero_ps();
4926 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4927 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices screen coordinates (4 floats each); the
// vertex arrays follow immediately after
4929 const float *screencoord4f = command->arrays;
4930 const float *arrays = screencoord4f + numvertices*4;
4932 // generate the 3 edges of this triangle
4933 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the three vertex indices, rebased to firstvertex, from the 16-bit or the 32-bit
// index list (the selecting condition is not visible in this excerpt)
4936 e[0] = element3s[i*3+0] - firstvertex;
4937 e[1] = element3s[i*3+1] - firstvertex;
4938 e[2] = element3s[i*3+2] - firstvertex;
4942 e[0] = element3i[i*3+0] - firstvertex;
4943 e[1] = element3i[i*3+1] - firstvertex;
4944 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code further down:
//  SKIPBACKFACE      - builds the two screen-space edge vectors from screen[1] and a scalar
//                      cross-product term whose sign is tested against zero
//  CLIPPEDVERTEXLERP - projects a vertex interpolated between p1 and p2 at the fraction
//                      clipdist[p1] / (clipdist[p1] - clipdist[p2]), i.e. where clipdist is zero
//  CLIPPEDVERTEXCOPY - reuses the already projected screen coordinate of vertex p1
//  GENATTRIBCOPY / GENATTRIBLERP / GENATTRIBS - the same copy / interpolate choices applied
//                      to an attribute array, reusing the stored clipfrac values
4953 #define SKIPBACKFACE \
4954 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4955 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4956 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4957 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4958 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4962 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4966 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4971 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4972 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4974 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4975 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4977 #define CLIPPEDVERTEXCOPY(k,p1) \
4978 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4980 #define GENATTRIBCOPY(attrib, p1) \
4981 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4982 #define GENATTRIBLERP(attrib, p1, p2) \
4984 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4985 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4987 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4991 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4992 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4993 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4994 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4995 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4996 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4997 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
5003 // calculate distance from nearplane
// clipdist is z + w of each vertex in the first vertex array; negative means behind the plane
5004 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5005 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5006 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// the nested tests below enumerate which of the three vertices are in front of the plane;
// clipping one vertex away yields four screen points, clipping two yields three
5007 if (clipdist[0] >= 0.0f)
5009 if (clipdist[1] >= 0.0f)
5011 if (clipdist[2] >= 0.0f)
5014 // triangle is entirely in front of nearplane
5015 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5022 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5030 if (clipdist[2] >= 0.0f)
5032 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5039 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5046 else if (clipdist[1] >= 0.0f)
5048 if (clipdist[2] >= 0.0f)
5050 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5057 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5063 else if (clipdist[2] >= 0.0f)
5065 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5070 else continue; // triangle is entirely behind nearplane
5073 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four screen points into 16-bit lanes; a
// three-point polygon repeats point 2 in the fourth slot. The min/max steps reduce those
// to a bounding box, which is then clamped to fbmin/fbmax.
5074 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5075 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5076 screenmin = _mm_min_epi16(screeni, screenir),
5077 screenmax = _mm_max_epi16(screeni, screenir);
5078 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5079 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5080 screenmin = _mm_max_epi16(screenmin, fbmin);
5081 screenmax = _mm_min_epi16(screenmax, fbmax);
5082 // skip offscreen triangles
5083 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5085 starty = _mm_extract_epi16(screenmin, 1);
5086 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely in the gap between this thread's two row bands
5087 if (starty >= maxy1 && endy <= miny2)
// per-point integer y, one 32-bit lane each, for the edge classification further down
5089 screeny = _mm_srai_epi32(screeni, 16);
5092 triangle = &thread->triangles[thread->numtriangles];
5094 // calculate attribute plans for triangle data...
5095 // okay, this triangle is going to produce spans, we'd better project
5096 // the interpolants now (this is what gives perspective texturing),
5097 // this consists of simply multiplying all arrays by the W coord
5098 // (which is basically 1/Z), which will be undone per-pixel
5099 // (multiplying by Z again) to get the perspective-correct array
5102 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5103 __m128 mipedgescale, mipdensity;
// edge components divided by the cross-product term from SKIPBACKFACE; these factors turn
// a difference along the two triangle edges into a per-pixel x slope and y slope
5104 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5105 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5106 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5107 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5108 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane for the w component of the screen coordinates: stored as x slope, y slope, origin
5109 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5110 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5111 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5112 attribedge1 = _mm_sub_ss(w0, w1);
5113 attribedge2 = _mm_sub_ss(w2, w1);
5114 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5115 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5116 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5117 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5118 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5119 _mm_store_ss(&triangle->w[0], attribxslope);
5120 _mm_store_ss(&triangle->w[1], attribyslope);
5121 _mm_store_ss(&triangle->w[2], attriborigin);
// optional clip plane: build the plane of screen z the same way, combine it with
// fb_clipplane into a linear function of screen x and y, and derive from it a per-row
// boundary column described by clip0origin and clip0slope
5126 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5128 float cliporigin, clipxslope, clipyslope;
5129 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5130 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5131 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5132 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5133 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5134 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5135 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5136 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5137 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5140 clip0origin = -cliporigin/clipxslope;
5141 clip0slope = -clipyslope/clipxslope;
5142 clip0dir = clipxslope > 0 ? 1 : -1;
5144 else if(clipyslope > 0)
5146 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5147 clip0slope = dpsoftrast.fb_width;
5150 else if(clipyslope < 0)
5152 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5153 clip0slope = -dpsoftrast.fb_width;
// NOTE(review): none of the visible lines assigns clip0origin on the path that reaches
// this test (it is declared without an initializer and only set in the branches above),
// so this may read a stale or indeterminate value; cliporigin may have been intended.
// Confirm against the lines not shown here.
5156 else if(clip0origin < 0) continue;
5159 mipedgescale = _mm_setzero_ps();
// for each attribute array the shader mode uses (list ends at the first index >=
// DPSOFTRAST_ARRAY_TOTAL): multiply the attribute by w at each point and store the
// resulting plane as x slope, y slope, origin in triangle->attribs[k]
5160 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5162 __m128 attrib0, attrib1, attrib2;
5163 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5164 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5166 arrays += numvertices*4;
5167 GENATTRIBS(attrib0, attrib1, attrib2);
5168 attriborigin = _mm_mul_ps(attrib1, w1);
5169 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5170 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5171 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5172 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5173 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5174 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5175 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5176 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array designated by lodarrayindex, also keep the attribute change along each
// of the two edges scaled by the reciprocal screen-space length of that edge
5177 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5179 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5180 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5181 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5182 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for every texture unit the shader mode uses (list ends at the first
// index >= DPSOFTRAST_MAXTEXTUREUNITS); units left untouched keep mip 0
5186 memset(triangle->mip, 0, sizeof(triangle->mip));
5187 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5189 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5190 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5192 texture = thread->texbound[texunit];
5193 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two consecutive ints read from mipmap[0][2] (presumably the base level's
// width and height - confirm), square, sum per edge, and keep the smaller edge value
5195 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5196 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5197 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5198 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5199 // this will be multiplied in the texturing routine by the texture resolution
5200 y = _mm_cvtss_si32(mipdensity);
// log(y)*0.5/ln(2) is half of log2(y), truncated, then limited to the last mip level
5203 y = (int)(log((float)y)*0.5f/M_LN2);
5204 if (y > texture->mipmaps - 1)
5205 y = texture->mipmaps - 1;
5206 triangle->mip[texunit] = y;
// walk the rows from starty to endy in up to two passes: first limited by maxy1, then
// resumed at miny2 and limited by maxy2
5212 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5215 __m128 xcoords, xslope;
// one 32-bit lane per screen point, all ones when y is greater than that point's integer
// y; the byte mask therefore has one hex digit per point
5216 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5217 int yccmask = _mm_movemask_epi8(ycc);
5218 int edge0p, edge0n, edge1p, edge1n;
// the tables below map that mask to the two polygon edges (as point index pairs p -> n)
// that bound the current rows; the first table has four-point entries, the second
// three-point entries (the switch headers selecting between them are not visible here)
5227 case 0xFFFF: /*0000*/ y = endy; continue;
5228 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5229 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5230 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5231 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5232 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5233 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5234 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5235 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5236 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5237 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5238 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5239 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5240 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5241 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5242 case 0x0000: /*1111*/ y++; continue;
5250 case 0xFFFF: /*000*/ y = endy; continue;
5251 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5252 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5253 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5254 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5255 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5256 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5257 case 0x0000: /*111*/ y++; continue;
// nexty: the last row that can use this edge pair, limited to the current band
5260 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5261 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5262 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5263 nexty = _mm_extract_epi16(ycc, 0);
5264 if (nexty >= bandy) nexty = bandy-1;
// per-edge dx/dy (elements 0 and 2) and the x of both edges at row y, offset by 0.5
5265 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5266 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5267 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5268 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5269 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in element 0: swap the two edges when the first is to the right
5270 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5272 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5273 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5275 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
// one iteration per row: step both edge x values and the clip boundary by their slopes
5276 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5278 int startx, endx, offset;
5279 startx = _mm_cvtss_si32(xcoords);
5280 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5281 if (startx < minx) startx = minx;
5282 if (endx > maxx) endx = maxx;
5283 if (startx >= endx) continue;
// clip-plane trimming of the row against clip0 (the enclosing conditions are not
// visible in this excerpt)
5291 if(endx <= clip0) continue;
5292 startx = (int)clip0;
5295 else if (endx > clip0)
5297 if(startx >= clip0) continue;
// cut the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5302 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5304 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5305 span->triangle = thread->numtriangles;
5309 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5310 if (span->startx >= span->endx)
// integer depth plane for the span: w at the span origin and its x slope, scaled by
// DPSOFTRAST_DEPTHSCALE, minus the polygon-offset bias
5312 wslope = triangle->w[0];
5313 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5314 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5315 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: render everything queued so far
5316 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5317 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: render queued spans before triangle slots are reused
5322 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5324 DPSOFTRAST_Draw_ProcessSpans(thread);
5325 thread->numtriangles = 0;
// release this thread's reference; the last one frees separately allocated vertex data
5329 if (!ATOMIC_DECREMENT(command->refcount))
5331 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5332 MM_FREE(command->arrays);
// render whatever is still queued for this command
5335 if (thread->numspans > 0 || thread->numtriangles > 0)
5337 DPSOFTRAST_Draw_ProcessSpans(thread);
5338 thread->numtriangles = 0;
// Allocates a Draw command together with storage for its vertex data and a private copy
// of the triangle indices, and points the global post-transform array table at that storage.
//   firstvertex, numvertices - vertex range, recorded in the command and in dpsoftrast
//   numtriangles             - number of triangles (three indices each)
//   element3i / element3s    - 32-bit / 16-bit index lists; one of them is copied
// Data layout, in order: screen coordinates, post-transform positions, one array per
// entry of the current shader mode's arrays list, then the index copy. Each vertex array
// holds numvertices float[4] elements.
// NOTE(review): the conditions choosing between the two index lists and the return
// statement are not visible in this excerpt; the MM_CALLOC result is used without a
// visible NULL check.
5343 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5347 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two fixed arrays: screen coordinates and post-transform positions
5348 int datasize = 2*numvertices*sizeof(float[4]);
5349 DPSOFTRAST_Command_Draw *command;
5350 unsigned char *data;
// plus one array per entry in the shader mode's list (ends at the first index >=
// DPSOFTRAST_ARRAY_TOTAL)
5351 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5353 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5354 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5356 datasize += numvertices*sizeof(float[4]);
// plus room for the index copy, 16-bit or 32-bit
5359 datasize += numtriangles*sizeof(unsigned short[3]);
5361 datasize += numtriangles*sizeof(int[3]);
5362 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to store inline: allocate only the header as the command and take the data
// from MM_CALLOC; consumers recognise this case by the header-only commandsize and free
// command->arrays themselves
5363 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5365 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5366 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data sits directly behind the command header
5370 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5371 data = (unsigned char *)command + commandsize;
5373 command->firstvertex = firstvertex;
5374 command->numvertices = numvertices;
5375 command->numtriangles = numtriangles;
5376 command->arrays = (float *)data;
// reset every post-transform array pointer, then hand out consecutive slices of data
5377 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5378 dpsoftrast.firstvertex = firstvertex;
5379 dpsoftrast.numvertices = numvertices;
5380 dpsoftrast.screencoord4f = (float *)data;
5381 data += numvertices*sizeof(float[4]);
5382 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5383 data += numvertices*sizeof(float[4]);
5384 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5386 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5387 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5389 dpsoftrast.post_array4f[j] = (float *)data;
5390 data += numvertices*sizeof(float[4]);
// copy the caller's indices so the command does not depend on the caller's buffers
5392 command->element3i = NULL;
5393 command->element3s = NULL;
5396 command->element3s = (unsigned short *)data;
5397 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5401 command->element3i = (int *)data;
5402 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point: submits a batch of indexed triangles for rasterization.
//   firstvertex, numvertices - vertex range referenced by the indices
//   numtriangles             - number of triangles
//   element3i / element3s    - 32-bit / 16-bit index lists
// Builds the Draw command, runs the current shader mode's vertex stage, records the row
// range the vertex stage produced, and passes the command on to the worker threads.
// NOTE(review): the early return for an empty row range and the else branch separating
// the threaded and non-threaded paths are not visible in this excerpt.
5407 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5409 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5410 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the drawn row range to the framebuffer height
5411 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5412 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing visible: free separately allocated vertex data and take the command back
5413 if (command->starty >= command->endy)
5415 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5416 MM_FREE(command->arrays);
5417 DPSOFTRAST_UndoCommand(command->commandsize);
5420 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread decrements it in DPSOFTRAST_Interpret_Draw
5421 command->refcount = dpsoftrast.numthreads;
5423 if (dpsoftrast.usethreads)
5426 DPSOFTRAST_Draw_SyncCommands();
// signal only threads flagged as starving whose row bands overlap the command's rows
5427 for (i = 0; i < dpsoftrast.numthreads; i++)
5429 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5430 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5431 Thread_CondSignal(thread->drawcond);
5436 DPSOFTRAST_Draw_FlushThreads();
// Declares command number 23, SetRenderTargets, whose payload fields are
// width and height (DEFCOMMAND itself is defined outside this view).
5440 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
// Per-thread handler for a SetRenderTargets command: marks the thread's
// framebuffer state as needing revalidation. The command payload is not read
// by the visible statement.
5441 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5443 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Switch the rasterizer to a new set of render targets.
//   width, height    - dimensions stored as fb_width / fb_height
//   depthpixels      - stored as fb_depthpixels
//   colorpixels0..3  - stored as fb_colorpixels[0..3]
// After updating the global state it recomputes the viewport center/scale
// and queues a SetRenderTargets command carrying width and height.
5445 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5447 DPSOFTRAST_Command_SetRenderTargets *command;
// True when any dimension or target pointer differs from the current state.
// NOTE(review): line 5451 is missing from this listing, so the statement this
// condition guards is not visible here; do not assume the assignment on 5452
// is the guarded statement -- confirm against the full source.
5448 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5449 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5450 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5452 dpsoftrast.fb_width = width;
5453 dpsoftrast.fb_height = height;
5454 dpsoftrast.fb_depthpixels = depthpixels;
5455 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5456 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5457 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5458 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// The viewport rectangle itself is unchanged; only the derived center/scale
// values are recomputed.
5459 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5460 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5461 command->width = width;
5462 command->height = height;
// Execute queued commands on behalf of one thread, starting at the thread's
// own commandoffset and stopping when the offset equals endoffset.
//   thread    - state of the thread whose position in the command pool is advanced
//   endoffset - offset into dpsoftrast.commandpool.commands at which to stop
// Each command is dispatched on its opcode to the matching
// DPSOFTRAST_Interpret_* handler.
// NOTE(review): the embedded numbering skips 5466, 5469, 5472, 5479, 5501,
// 5506, 5508-5509, 5511-5514 and 5516-5517, so braces and short statements
// (e.g. whatever ends each case) are not visible in this listing.
5465 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5467 int commandoffset = thread->commandoffset;
5468 while (commandoffset != endoffset)
5470 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5471 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call the
// handler, advance by the aligned size of that command's struct, and wrap the
// offset to 0 once it reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
5473 #define INTERPCOMMAND(name) \
5474 case DPSOFTRAST_OPCODE_##name : \
5475 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5476 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5477 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5478 commandoffset = 0; \
5480 INTERPCOMMAND(Viewport)
5481 INTERPCOMMAND(ClearColor)
5482 INTERPCOMMAND(ClearDepth)
5483 INTERPCOMMAND(ColorMask)
5484 INTERPCOMMAND(DepthTest)
5485 INTERPCOMMAND(ScissorTest)
5486 INTERPCOMMAND(Scissor)
5487 INTERPCOMMAND(BlendFunc)
5488 INTERPCOMMAND(BlendSubtract)
5489 INTERPCOMMAND(DepthMask)
5490 INTERPCOMMAND(DepthFunc)
5491 INTERPCOMMAND(DepthRange)
5492 INTERPCOMMAND(PolygonOffset)
5493 INTERPCOMMAND(CullFace)
5494 INTERPCOMMAND(SetTexture)
5495 INTERPCOMMAND(SetShader)
5496 INTERPCOMMAND(Uniform4f)
5497 INTERPCOMMAND(UniformMatrix4f)
5498 INTERPCOMMAND(Uniform1i)
5499 INTERPCOMMAND(SetRenderTargets)
5500 INTERPCOMMAND(ClipPlane)
// Draw commands are variable-sized, so the advance uses the size stored in the
// command itself. Unlike the fixed-size cases, this case also writes the new
// offset back to the thread inside the case.
// NOTE(review): line 5506 (the statement guarded by the bounds check) is not
// shown; presumably it wraps the offset to 0 like the macro does -- confirm.
5502 case DPSOFTRAST_OPCODE_Draw:
5503 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5504 commandoffset += command->commandsize;
5505 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5507 thread->commandoffset = commandoffset;
// NOTE(review): the body of the Reset case (lines 5511-5514) is not shown.
5510 case DPSOFTRAST_OPCODE_Reset:
// Store the final position back to the thread.
5515 thread->commandoffset = commandoffset;
// Entry point handed to Thread_CreateThread for each rasterizer thread.
//   data - the DPSOFTRAST_State_Thread for this thread
// Runs for as long as thread->index is non-negative. Each pass interprets any
// commands between the thread's commandoffset and dpsoftrast.drawcommand, and
// sleeps on drawcond when there is nothing to do.
// NOTE(review): the embedded numbering skips 5519, 5522, 5524, 5526-5528,
// 5531, 5536 and 5538-5542, so the exact branch structure (for instance
// whether the locked section is an else branch) and the return statement are
// not visible in this listing.
5518 static int DPSOFTRAST_Draw_Thread(void *data)
5520 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5521 while(thread->index >= 0)
5523 if (thread->commandoffset != dpsoftrast.drawcommand)
5525 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Re-check under drawmutex before sleeping, so the decision to wait is made
// while holding the same mutex that is passed to Thread_CondWait.
5529 Thread_LockMutex(thread->drawmutex);
5530 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Caught up: signal waitcond if the waiting flag is set, then mark this
// thread starving for the duration of the wait on drawcond.
5532 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5533 thread->starving = true;
5534 Thread_CondWait(thread->drawcond, thread->drawmutex);
5535 thread->starving = false;
5537 Thread_UnlockMutex(thread->drawmutex);
// Bring every thread's commandoffset up to dpsoftrast.drawcommand, then mark
// the command pool as empty (usedcommands = 0).
// NOTE(review): the embedded numbering skips 5544, 5546, 5549, 5551, 5554,
// 5559-5560, 5562, 5565, 5568, 5572, 5574-5578, 5580 and 5584-5585, so braces,
// the declaration of i and any else keyword are not visible in this listing.
5543 static void DPSOFTRAST_Draw_FlushThreads(void)
5545 DPSOFTRAST_State_Thread *thread;
5547 DPSOFTRAST_Draw_SyncCommands();
5548 if (dpsoftrast.usethreads)
// Pass 1: signal drawcond on every thread that is behind and flagged starving.
// The unlocked comparison is only a fast pre-check; it is repeated under
// drawmutex before signalling.
5550 for (i = 0; i < dpsoftrast.numthreads; i++)
5552 thread = &dpsoftrast.threads[i];
5553 if (thread->commandoffset != dpsoftrast.drawcommand)
5555 Thread_LockMutex(thread->drawmutex);
5556 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5557 Thread_CondSignal(thread->drawcond);
5558 Thread_UnlockMutex(thread->drawmutex);
// Pass 2: for every thread still behind, set its waiting flag and block on its
// waitcond (the flag is cleared again after the wait returns).
5561 for (i = 0; i < dpsoftrast.numthreads; i++)
5563 thread = &dpsoftrast.threads[i];
5564 if (thread->commandoffset != dpsoftrast.drawcommand)
5566 Thread_LockMutex(thread->drawmutex);
5567 if (thread->commandoffset != dpsoftrast.drawcommand)
5569 thread->waiting = true;
5570 Thread_CondWait(thread->waitcond, thread->drawmutex);
5571 thread->waiting = false;
5573 Thread_UnlockMutex(thread->drawmutex);
// This loop interprets pending commands directly on the calling thread.
// NOTE(review): lines 5574-5578 are not shown; this is presumably the
// non-threaded branch of the usethreads test -- confirm.
5579 for (i = 0; i < dpsoftrast.numthreads; i++)
5581 thread = &dpsoftrast.threads[i];
5582 if (thread->commandoffset != dpsoftrast.drawcommand)
5583 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5586 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
5589 void DPSOFTRAST_Flush(void)
5591 DPSOFTRAST_Draw_FlushThreads();
// NOTE(review): only the signature is present in this listing (lines
// 5595-5598 are not shown), so the body cannot be described from here --
// consult the full source.
5594 void DPSOFTRAST_Finish(void)
// Initialize the global rasterizer state and start the worker threads.
//   width, height - framebuffer dimensions (stored as fb_width / fb_height)
//   numthreads    - requested thread count; threading is used only when this
//                   is > 0 and Thread_HasThreads() reports support
//   interlace     - limited to 0..1 when threaded, forced to 0 otherwise
//   colorpixels   - stored as color target 0; targets 1..3 start out NULL
//   depthpixels   - stored as the depth target
// NOTE(review): the embedded numbering skips 5600-5608, 5635, 5637, 5665,
// 5671, 5674, 5676 and 5681-5685, so the local declarations (i, u), at least
// one per-thread assignment and the return statement are not visible in this
// listing.
// NOTE(review): the MM_CALLOC result is used without a check in the visible
// lines.
5599 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// Start from an all-zero state; every field not set below stays 0/NULL.
5609 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// u is declared and initialized on lines not shown here; presumably a byte
// view of an int used as an endianness probe -- confirm.
5610 dpsoftrast.bigendian = u.b[3];
5611 dpsoftrast.fb_width = width;
5612 dpsoftrast.fb_height = height;
5613 dpsoftrast.fb_depthpixels = depthpixels;
5614 dpsoftrast.fb_colorpixels[0] = colorpixels;
// Clear the remaining color targets. The original assigned index 1 three
// times; indices 2 and 3 are the intended slots.
5615 dpsoftrast.fb_colorpixels[1] = NULL;
5616 dpsoftrast.fb_colorpixels[2] = NULL;
5617 dpsoftrast.fb_colorpixels[3] = NULL;
// Default viewport covers the whole framebuffer.
5618 dpsoftrast.viewport[0] = 0;
5619 dpsoftrast.viewport[1] = 0;
5620 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5621 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5622 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Texture bookkeeping starts at index 1 with nothing allocated (texture_max 0).
5623 dpsoftrast.texture_firstfree = 1;
5624 dpsoftrast.texture_end = 1;
5625 dpsoftrast.texture_max = 0;
5626 dpsoftrast.color[0] = 1;
5627 dpsoftrast.color[1] = 1;
5628 dpsoftrast.color[2] = 1;
5629 dpsoftrast.color[3] = 1;
// Without threading there is still exactly one thread-state entry, which the
// caller's thread uses.
5630 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5631 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5632 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5633 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// Give every thread state the same default render state.
5634 for (i = 0; i < dpsoftrast.numthreads; i++)
5636 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5638 thread->cullface = GL_BACK;
5639 thread->colormask[0] = 1;
5640 thread->colormask[1] = 1;
5641 thread->colormask[2] = 1;
5642 thread->colormask[3] = 1;
5643 thread->blendfunc[0] = GL_ONE;
5644 thread->blendfunc[1] = GL_ZERO;
5645 thread->depthmask = true;
5646 thread->depthtest = true;
5647 thread->depthfunc = GL_LEQUAL;
5648 thread->scissortest = false;
5649 thread->viewport[0] = 0;
5650 thread->viewport[1] = 0;
5651 thread->viewport[2] = dpsoftrast.fb_width;
5652 thread->viewport[3] = dpsoftrast.fb_height;
5653 thread->scissor[0] = 0;
5654 thread->scissor[1] = 0;
5655 thread->scissor[2] = dpsoftrast.fb_width;
5656 thread->scissor[3] = dpsoftrast.fb_height;
5657 thread->depthrange[0] = 0;
5658 thread->depthrange[1] = 1;
5659 thread->polygonoffset[0] = 0;
5660 thread->polygonoffset[1] = 0;
5661 thread->clipplane[0] = 0;
5662 thread->clipplane[1] = 0;
5663 thread->clipplane[2] = 0;
5664 thread->clipplane[3] = 1;
5666 thread->numspans = 0;
5667 thread->numtriangles = 0;
5668 thread->commandoffset = 0;
5669 thread->waiting = false;
5670 thread->starving = false;
// Mark all state dirty and validate it once up front.
5672 thread->validate = -1;
5673 DPSOFTRAST_Validate(thread, -1);
// Synchronization objects are created before the worker is started, since the
// worker function uses drawmutex, drawcond and waitcond.
5675 if (dpsoftrast.usethreads)
5677 thread->waitcond = Thread_CreateCond();
5678 thread->drawcond = Thread_CreateCond();
5679 thread->drawmutex = Thread_CreateMutex();
5680 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5686 void DPSOFTRAST_Shutdown(void)
5689 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5691 DPSOFTRAST_State_Thread *thread;
5692 for (i = 0; i < dpsoftrast.numthreads; i++)
5694 thread = &dpsoftrast.threads[i];
5695 Thread_LockMutex(thread->drawmutex);
5697 Thread_CondSignal(thread->drawcond);
5698 Thread_UnlockMutex(thread->drawmutex);
5699 Thread_WaitThread(thread->thread, 0);
5700 Thread_DestroyCond(thread->waitcond);
5701 Thread_DestroyCond(thread->drawcond);
5702 Thread_DestroyMutex(thread->drawmutex);
5705 for (i = 0;i < dpsoftrast.texture_end;i++)
5706 if (dpsoftrast.texture[i].bytes)
5707 MM_FREE(dpsoftrast.texture[i].bytes);
5708 if (dpsoftrast.texture)
5709 free(dpsoftrast.texture);
5710 if (dpsoftrast.threads)
5711 MM_FREE(dpsoftrast.threads);
5712 memset(&dpsoftrast, 0, sizeof(dpsoftrast));