// Conventionally makes <math.h> expose the M_* constants on toolchains that gate them behind this macro.
3 #define _USE_MATH_DEFINES
// Public interface of this software rasterizer.
7 #include "dpsoftrast.h"
// MSVC C4324 ("structure was padded due to alignment specifier") is expected for the ALIGN()ed types in this file.
10 #pragma warning(disable : 4324)
// Make the name `bool` used throughout this file an alias of the project's qboolean type.
14 typedef qboolean bool;
// Platform abstraction for alignment and atomic counters. Every toolchain
// branch below defines the same macro set:
//   ALIGN(var)                 - declare var with 16-byte alignment
//   ATOMIC(var)                - declare var with 4-byte alignment
//   MEMORY_BARRIER             - _mm_sfence() in each branch shown
//   ATOMIC_COUNTER             - integer type used for shared counters
//   ATOMIC_INCREMENT/DECREMENT - atomically add +1 / -1 to a counter
//   ATOMIC_ADD(counter, val)   - atomically add val, result discarded
// NOTE(review): not every conditional-compilation line of this region is
// visible here, so the exact #if/#endif nesting should be confirmed.

// Apple: OSAtomic*32Barrier primitives from <libkern/OSAtomic.h>.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows (e.g. mingw): GCC attribute syntax with the Win32 Interlocked* API.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
// Other GCC-compatible compilers: __sync_* builtins.
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) precedes the declarator; Interlocked* API without the cast.
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment request, no barrier, and plain
// (non-atomic) ++ / -- / += on an int counter.
// NOTE(review): the guards around ALIGN, ATOMIC and ATOMIC_ADD are not visible here.
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// Fallback for GCC releases older than 4.6, which is the cutoff the original
// condition aimed at: on those, _mm_cvtss_f32 is supplied here as a macro over
// the vector-extract builtin. The test compares the real __GNUC__ macro (the
// misspelled __GNUC always evaluated to 0, making the test true on every GCC)
// and only consults the minor number for the 4.x series, so 5.0-5.5, 6.0-6.5
// and so on no longer match. clang is excluded.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
// Allocation wrappers. The first group goes through _mm_malloc/_mm_free with
// ALIGN_SIZE alignment; the second group maps to plain malloc/calloc.
// NOTE(review): the preprocessor conditional selecting between the two groups
// (and any MM_FREE for the second group) is not visible here.
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
// Aligned counterpart of calloc: allocates nmemb*size bytes and zero-fills them.
// NOTE(review): nmemb*size is not checked for size_t overflow.
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
// the memset is skipped when the allocation failed
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
103 #define MM_FREE _mm_free
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and
// sizes DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f.
110 typedef enum DPSOFTRAST_ARRAY_e
112 DPSOFTRAST_ARRAY_POSITION,
113 DPSOFTRAST_ARRAY_COLOR,
114 DPSOFTRAST_ARRAY_TEXCOORD0,
115 DPSOFTRAST_ARRAY_TEXCOORD1,
116 DPSOFTRAST_ARRAY_TEXCOORD2,
117 DPSOFTRAST_ARRAY_TEXCOORD3,
118 DPSOFTRAST_ARRAY_TEXCOORD4,
119 DPSOFTRAST_ARRAY_TEXCOORD5,
120 DPSOFTRAST_ARRAY_TEXCOORD6,
121 DPSOFTRAST_ARRAY_TEXCOORD7,
122 DPSOFTRAST_ARRAY_TOTAL
// One texture slot of the dpsoftrast.texture array.
// NOTE(review): further members (flags, width, height, depth, sides, mipmaps,
// size are written by DPSOFTRAST_Texture_New) are not visible here.
126 typedef struct DPSOFTRAST_Texture_s
// filter mode, set by DPSOFTRAST_Texture_Filter
133 DPSOFTRAST_TEXTURE_FILTER filter;
// atomic counter; presumably the number of outstanding binds - TODO confirm against its users
136 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL value marks the slot as unused
137 unsigned char *bytes;
// per mip level, as filled in by DPSOFTRAST_Texture_New:
// [0] byte offset of the level inside bytes, [1] byte size of the level,
// [2] width, [3] height, [4] depth
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are sized in multiples of COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND)
// and their structs are declared with the ALIGN() alignment.
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command struct: the opcode selecting the
// command and the byte size recorded by DPSOFTRAST_AllocateCommand.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when a command does not
// fit in the remaining tail of the command pool.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> = opcodeval and the aligned struct type
// DPSOFTRAST_Command_<name>, which starts with the common opcode/commandsize
// header. NOTE(review): the line placing `fields` into the struct is not
// visible here.
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool buffer (2 MiB).
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command's size; its use is not visible here.
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Byte buffer holding queued commands.
// NOTE(review): the freecommand / usedcommands members referenced by the
// allocator functions are not visible in this declaration.
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
170 ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
// Per-triangle interpolation setup consumed by the DPSOFTRAST_CALCATTRIB macros.
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][k][component]: as read by DPSOFTRAST_CALCATTRIB, k=0 is the
// per-x slope, k=1 the per-y slope and k=2 the base value
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// SSE evaluation of one 4-component attribute at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// `data` and `slope` must be assignable __m128 lvalues; the attribs rows are
// read with aligned loads (_mm_load_ps).
182 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
183 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
184 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
185 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
186 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: evaluates one 4-component
// attribute at the start of a span.
//   slope[c] = attribs[arrayindex][0][c]
//   data[c]  = attribs[arrayindex][2][c] + span->x * slope[c]
//                                        + span->y * attribs[arrayindex][1][c]
// triangle / span are pointers; data / slope are arrays (or pointers) of at
// least four floats. Every pointer and array argument is parenthesized so
// expressions such as `spans + i` or `&span` can be passed safely.
// Arguments are evaluated more than once; do not pass expressions with side effects.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
// NOTE(review): presumably the number of pixels handled per interpolation step inside a span; its use is not visible here.
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
// Capacities of the per-thread arrays in DPSOFTRAST_State_Thread:
// spans[], triangles[] and (plus 4 padding bytes) pixelmaskarray[].
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Dirty bits kept in DPSOFTRAST_State_Thread::validate. A set bit means the
// matching derived state must be recomputed by DPSOFTRAST_Validate:
// FB -> DPSOFTRAST_RecalcFB, DEPTHFUNC -> DPSOFTRAST_RecalcDepthFunc,
// BLENDFUNC -> DPSOFTRAST_RecalcBlendFunc. VALIDATE_DRAW is the union of all three.
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc maps a
// GL (source factor, destination factor) pair onto one of these and falls
// back to OPAQUE for any pair it does not list. TOTAL is the number of modes.
223 typedef enum DPSOFTRAST_BLENDMODE_e
225 DPSOFTRAST_BLENDMODE_OPAQUE,
226 DPSOFTRAST_BLENDMODE_ALPHA,
227 DPSOFTRAST_BLENDMODE_ADDALPHA,
228 DPSOFTRAST_BLENDMODE_ADD,
229 DPSOFTRAST_BLENDMODE_INVMOD,
230 DPSOFTRAST_BLENDMODE_MUL,
231 DPSOFTRAST_BLENDMODE_MUL2,
232 DPSOFTRAST_BLENDMODE_SUBALPHA,
233 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
234 DPSOFTRAST_BLENDMODE_INVADD,
235 DPSOFTRAST_BLENDMODE_TOTAL
237 DPSOFTRAST_BLENDMODE;
// Per-worker-thread state: a private copy of the render state (updated by the
// DPSOFTRAST_Interpret_* functions), values derived from it, command queue
// bookkeeping and scratch arrays.
// NOTE(review): many members used elsewhere in the file (index, viewport,
// scissor, validate, drawmutex, ...) are not visible in this declaration.
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
255 float polygonoffset[2];
// clip plane in framebuffer space, written by DPSOFTRAST_RecalcClipPlane
257 ALIGN(float fb_clipplane[4]);
260 int shader_permutation;
261 int shader_exactspecularmath;
// bound texture per unit; pointers into dpsoftrast.texture (rebased by DPSOFTRAST_Texture_Grow)
263 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
265 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
268 // DPSOFTRAST_VALIDATE_ flags
271 // derived values (DPSOFTRAST_VALIDATE_FB)
// viewport transform written by DPSOFTRAST_RecalcViewport:
// index 1 = x, 2 = y, 3 = depth, 0 = identity slot (center 0, scale 1)
274 ALIGN(float fb_viewportcenter[4]);
275 ALIGN(float fb_viewportscale[4]);
277 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
280 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's position in the command pool; DPSOFTRAST_Draw_FreeCommandPool
// compares it against the producer's freecommand / drawcommand
289 ATOMIC(volatile int commandoffset);
// waiting: set by DPSOFTRAST_Draw_FreeCommandPool while it blocks on waitcond
// starving: when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before waiting
291 volatile bool waiting;
292 volatile bool starving;
299 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
303 DPSOFTRAST_State_Thread);
// Global rasterizer state owned by the thread issuing API calls: framebuffer
// pointers, the producer-side copy of the render state, vertex array
// pointers, the texture table, the worker threads and the shared command pool.
// NOTE(review): several members used elsewhere in the file (fb_width,
// fb_height, viewport, numthreads, usethreads, interlace, texture_max,
// texture_end, ...) are not visible in this declaration.
305 typedef ALIGN(struct DPSOFTRAST_State_s
309 unsigned int *fb_depthpixels;
// up to four color targets; NULL entries are tested for in DPSOFTRAST_Interpret_ClearColor
310 unsigned int *fb_colorpixels[4];
// viewport transform kept up to date by DPSOFTRAST_Viewport
313 ALIGN(float fb_viewportcenter[4]);
314 ALIGN(float fb_viewportscale[4]);
317 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// client vertex arrays
320 const float *pointer_vertex3f;
321 const float *pointer_color4f;
322 const unsigned char *pointer_color4ub;
323 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
326 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
328 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
// NOTE(review): presumably per-array output of vertex processing plus screen-space positions - confirm against the draw code
332 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333 float *screencoord4f;
339 int shader_permutation;
340 int shader_exactspecularmath;
// texture table: DPSOFTRAST_Texture_New starts its free-slot search at texture_firstfree
344 int texture_firstfree;
345 DPSOFTRAST_Texture *texture;
// message describing the most recent error, set by the DPSOFTRAST_Texture_* functions
350 const char *errorstring;
355 DPSOFTRAST_State_Thread *threads;
// command pool position published to the workers by DPSOFTRAST_Draw_SyncCommands
357 ATOMIC(volatile int drawcommand);
359 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
363 DPSOFTRAST_State dpsoftrast;
// Scale applied to (1 - depth) when converting to integer depth values: 2^30.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels (nominally 0..1) into one BGRA8 pixel:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7.
// Each channel is scaled by 255 and rounded by adding 0.5 before truncation;
// no clamping is performed. Arguments are parenthesized so that compound
// expressions (e.g. `base + delta`) are scaled as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Convert a float depth d to the integer depth buffer representation
// DPSOFTRAST_DEPTHSCALE * (1 - d), truncated towards zero.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// Forward declarations; both functions are defined later in the file.
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Derive the viewport transform from viewport = {x, y, width, height}.
// Both outputs are 4-float arrays indexed 1 = x, 2 = y, 3 = depth, with
// index 0 as an identity slot (center 0, scale 1).
//   fb_viewportcenter: pixel-space center of the viewport; y is measured
//                      from the top using dpsoftrast.fb_height; depth center 0.5
//   fb_viewportscale:  half extents; the y scale is negative (y axis flipped);
//                      depth scale 0.5
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
375 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377 fb_viewportcenter[3] = 0.5f;
378 fb_viewportcenter[0] = 0.0f;
379 fb_viewportscale[1] = 0.5f * viewport[2];
380 fb_viewportscale[2] = -0.5f * viewport[3];
381 fb_viewportscale[3] = 0.5f;
382 fb_viewportscale[0] = 1.0f;
// Assign the framebuffer rows this thread is responsible for, as two
// half-open row ranges [miny1, maxy1) and [miny2, maxy2).
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the framebuffer is cut into 2*numthreads bands; this thread
// takes band `index` and band `numthreads + index`
387 if (dpsoftrast.interlace)
389 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// NOTE(review): presumably the non-interlaced (else) branch - one band per
// thread, both ranges identical
396 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Convert thread->clipplane into framebuffer space (thread->fb_clipplane)
// using the current viewport transform. Must run after fb_viewportscale /
// fb_viewportcenter are up to date (DPSOFTRAST_RecalcFB calls it right after
// DPSOFTRAST_RecalcViewport).
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
// divide each plane coefficient by the matching viewport scale
// (plane x,y,z -> scale slots 1,2,3; plane constant -> identity slot 0)
403 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
// fold the viewport center offset into the constant term
407 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recompute all framebuffer-dependent derived state of a thread: the clamped
// scissor rectangle (fb_scissor = {x, y, width, height} in framebuffer rows,
// y flipped relative to thread->scissor), the viewport transform, the clip
// plane and the thread's row bands.
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
412 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413 // and viewport projection values
416 x1 = thread->scissor[0];
417 x2 = thread->scissor[0] + thread->scissor[2];
418 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the whole framebuffer is used
420 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp to the framebuffer
// NOTE(review): only the upper-bound clamps are visible here; confirm the lower bounds are clamped too
422 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
424 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425 thread->fb_scissor[0] = x1;
426 thread->fb_scissor[1] = y1;
427 thread->fb_scissor[2] = x2 - x1;
428 thread->fb_scissor[3] = y2 - y1;
// order matters: the clip plane conversion reads the viewport transform
430 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431 DPSOFTRAST_RecalcClipPlane(thread);
432 DPSOFTRAST_RecalcThread(thread);
// Effective depth comparison: the configured depthfunc while depth testing is
// enabled, GL_ALWAYS otherwise.
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
437 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Translate the GL blend factors (blendfunc[0] = source, blendfunc[1] =
// destination) and the blendsubtract flag into thread->fb_blendmode.
// The two factors are packed into one integer (source << 16 | destination)
// so a single switch can match the pair; unlisted pairs fall back to OPAQUE.
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_ALPHA, GL_ONE) is supported
442 if (thread->blendsubtract)
444 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one `case` label for a factor pair; it is defined here
// and reused by the second switch below
446 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// NOTE(review): presumably the else branch (regular, non-subtractive blending)
454 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
456 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two equivalent ways of requesting a multiply map to the same mode
461 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Cheap front end for DPSOFTRAST_Validate: only makes the call when at least
// one of the DPSOFTRAST_VALIDATE_* bits in f is still dirty for the thread.
// The expression always evaluates to 0. `thread` is evaluated more than once,
// so it must not have side effects; it is parenthesized so that pointer
// expressions such as `threads + i` are accepted.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Bring the derived state selected by `mask` (DPSOFTRAST_VALIDATE_* bits) up
// to date. Only bits that are both requested and currently dirty in
// thread->validate are processed; each dirty bit is cleared before its
// recalculation runs.
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// restrict the request to what is actually dirty
475 mask &= thread->validate;
478 if (mask & DPSOFTRAST_VALIDATE_FB)
480 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481 DPSOFTRAST_RecalcFB(thread);
483 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
485 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486 DPSOFTRAST_RecalcDepthFunc(thread);
488 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
490 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491 DPSOFTRAST_RecalcBlendFunc(thread);
// Look up a texture by its index. A texture is returned only when the index
// lies in [1, texture_end) and the slot has pixel storage (bytes != NULL).
// Callers test the result with `!texture`, so the failure result is a null
// pointer (the failing return itself is not visible here).
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
497 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498 return &dpsoftrast.texture[index];
// Enlarge the dpsoftrast.texture array (first to 1024 entries, afterwards
// doubling) and fix up every texbound pointer - in the global state and in
// each worker thread - so it points at the same slot of the relocated array.
// NOTE(review): the realloc result is stored straight into
// dpsoftrast.texture; on allocation failure the old array would be lost and
// the rebasing below would work from a NULL base. The rebasing also subtracts
// `oldtexture`, which may already have been freed by realloc.
502 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be converted to offsets
504 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505 DPSOFTRAST_State_Thread *thread;
509 // expand texture array as needed
510 if (dpsoftrast.texture_max < 1024)
511 dpsoftrast.texture_max = 1024;
// NOTE(review): presumably the else branch of the test above
513 dpsoftrast.texture_max *= 2;
514 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the producer-side bindings
515 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516 if (dpsoftrast.texbound[i])
517 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase every worker thread's bindings
518 for (j = 0; j < dpsoftrast.numthreads; j++)
520 thread = &dpsoftrast.threads[j];
521 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522 if (thread->texbound[i])
523 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Create a texture with the given flags and dimensions and allocate zeroed
// pixel storage for all of its mip levels (4 bytes per texel).
// On a rejected request dpsoftrast.errorstring is set to a description.
// NOTE(review): the return statements are not visible here; presumably the
// slot index (texnum) is returned on success and 0 on failure - confirm.
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces, everything else one
536 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538 DPSOFTRAST_Texture *texture;
// NOTE(review): this tests the product, which can overflow int for large
// inputs and is positive when two dimensions are negative
539 if (width*height*depth < 1)
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
546 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions (the enclosing switch line is not visible here)
551 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the condition guarding this second, identical message is not visible here
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
568 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are not supported
573 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): exact duplicate of the preceding check
583 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
590 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero for any positive x that is not a power of two
593 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
595 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598 // find first empty slot in texture array
599 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600 if (!dpsoftrast.texture[texnum].bytes)
// the next search can start right after the slot taken now
602 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the chosen slot lies beyond its capacity
603 if (dpsoftrast.texture_max <= texnum)
604 DPSOFTRAST_Texture_Grow();
605 if (dpsoftrast.texture_end <= texnum)
606 dpsoftrast.texture_end = texnum + 1;
607 texture = &dpsoftrast.texture[texnum];
608 memset(texture, 0, sizeof(*texture));
609 texture->flags = flags;
610 texture->width = width;
611 texture->height = height;
612 texture->depth = depth;
613 texture->sides = sides;
// lay out the mip chain: for each level record {byte offset, byte size, w, h, d};
// `size` accumulates the total (the loop header and size update are not visible here)
622 s = w * h * d * sides * 4;
623 texture->mipmap[mipmaps][0] = size;
624 texture->mipmap[mipmaps][1] = s;
625 texture->mipmap[mipmaps][2] = w;
626 texture->mipmap[mipmaps][3] = h;
627 texture->mipmap[mipmaps][4] = d;
// the chain ends at 1x1x1, or after level 0 when mipmaps were not requested
630 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
636 texture->mipmaps = mipmaps;
637 texture->size = size;
639 // allocate the pixels now
// NOTE(review): no check of the allocation result is visible here
640 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Release the pixel storage of a texture and return its slot to the free
// pool. Unknown or already free indices are ignored.
644 void DPSOFTRAST_Texture_Free(int index)
646 DPSOFTRAST_Texture *texture;
647 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651 MM_FREE(texture->bytes);
652 texture->bytes = NULL;
// clearing the whole slot also leaves bytes == NULL, which marks it as unused
653 memset(texture, 0, sizeof(*texture));
654 // adjust the free range and used range
655 if (dpsoftrast.texture_firstfree > index)
656 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing unused slots
657 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
658 dpsoftrast.texture_end--;
// Regenerate mip levels 1..mipmaps-1 of a texture from level 0 by averaging
// neighbouring texels of the next larger level. Texels are 4 bytes and each
// byte channel is averaged independently with round-to-nearest.
// NOTE(review): the lines that set layer0/layer1 and row0/row1, and the
// condition selecting the 3D versus 2D paths, are not visible here.
660 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
662 int i, x, y, z, w, layer0, layer1, row0, row1;
663 unsigned char *o, *i0, *i1, *i2, *i3;
664 DPSOFTRAST_Texture *texture;
665 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// a single level has nothing to derive
666 if (texture->mipmaps <= 1)
// each level i is built from level i-1
668 for (i = 1;i < texture->mipmaps;i++)
670 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
674 if (layer1 >= texture->mipmap[i-1][4])
675 layer1 = texture->mipmap[i-1][4]-1;
676 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
680 if (row1 >= texture->mipmap[i-1][3])
681 row1 = texture->mipmap[i-1][3]-1;
// o: start of the output row in level i
// i0..i3: starts of the four parent rows (layer0/row0, layer0/row1, layer1/row0, layer1/row1)
682 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
683 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
684 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
685 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
686 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
687 w = texture->mipmap[i][2];
690 if (texture->mipmap[i-1][2] > 1)
692 // average 3D texture
// eight parent texels per output texel: +4 rounds, >>3 divides by 8
693 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
695 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
696 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
697 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
698 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
703 // average 3D mipmap with parent width == 1
// four parent texels per output texel: +2 rounds, >>2 divides by 4
// NOTE(review): only i0/i1 are advanced by this loop, not i2/i3; this is
// harmless only if the loop runs a single time (w == 1)
704 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
706 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
707 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
708 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
709 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
715 if (texture->mipmap[i-1][2] > 1)
717 // average 2D texture (common case)
// 2x2 parent texels per output texel
718 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
720 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
721 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
722 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
723 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
728 // 2D texture with parent width == 1
// two parent texels (one per row): +1 rounds, >>1 divides by 2
729 o[0] = (i0[0] + i1[0] + 1) >> 1;
730 o[1] = (i0[1] + i1[1] + 1) >> 1;
731 o[2] = (i0[2] + i1[2] + 1) >> 1;
732 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copy a blockwidth x blockheight rectangle of 4-byte texels from `pixels`
// into mip level 0 of a texture at (blockx, blocky), then rebuild the mip
// chain. Source rows are tightly packed (blockwidth*4 bytes each) and are
// stored bottom-up: input row `blocky + k` ends up k+blocky+1 rows before the
// end of level 0.
// NOTE(review): `mip` is not used in the lines visible here, and no bounds
// check of the block against the texture size is visible.
739 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
741 DPSOFTRAST_Texture *texture;
743 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// start at the end of level 0 (mipmap[0][1] is its byte size), moved up by
// blocky rows and right by blockx texels
748 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
749 while (blockheight > 0)
// step one destination row towards the start before each copy
751 dst -= texture->mipmap[0][2] * 4;
752 memcpy(dst, pixels, blockwidth * 4);
753 pixels += blockwidth * 4;
757 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replace the whole of mip level 0 with `pixels` (4 bytes per texel, one
// row of mipmap[0][2] texels per stride) and rebuild the mip chain.
// NOTE(review): the pointer updates inside the row loop are not visible
// here; starting `dst` at the end of level 0 suggests rows are written
// bottom-up as in DPSOFTRAST_Texture_UpdatePartial - confirm.
759 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
761 DPSOFTRAST_Texture *texture;
762 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
767 int i, stride = texture->mipmap[0][2]*4;
// mipmap[0][1] is the byte size of level 0, so dst starts one past its last row
768 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
// one iteration per row of level 0
769 for (i = texture->mipmap[0][3];i > 0;i--)
772 memcpy(dst, pixels, stride);
776 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Width in texels of mip level `mip` of a texture, or 0 when the index does
// not name a live texture.
// NOTE(review): `mip` is used as an array index without a range check.
778 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
780 DPSOFTRAST_Texture *texture;
781 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782 return texture->mipmap[mip][2];
// Height in texels of mip level `mip` of a texture, or 0 when the index does
// not name a live texture.
// NOTE(review): `mip` is used as an array index without a range check.
784 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
786 DPSOFTRAST_Texture *texture;
787 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788 return texture->mipmap[mip][3];
// Depth in texels of mip level `mip` of a texture, or 0 when the index does
// not name a live texture.
// NOTE(review): `mip` is used as an array index without a range check.
790 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
792 DPSOFTRAST_Texture *texture;
793 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794 return texture->mipmap[mip][4];
// Pointer to the first byte of mip level `mip` inside the texture's pixel
// storage, or a null pointer when the index does not name a live texture.
// NOTE(review): `mip` is used as an array index without a range check, and
// the statements between the lookup and the return are not visible here.
796 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
798 DPSOFTRAST_Texture *texture;
799 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset within bytes
802 return texture->bytes + texture->mipmap[mip][0];
// Select the filter mode of a texture. Modes ordered after
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are refused (errorstring is set) for
// textures created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
// NOTE(review): the statements between the error branch and the final
// assignment are not visible here.
804 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
806 DPSOFTRAST_Texture *texture;
807 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
808 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
810 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
815 texture->filter = filter;
// Forward declaration; needed by DPSOFTRAST_AllocateCommand before the definition appears.
818 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publish the producer's write position: dpsoftrast.drawcommand is set to the
// command pool's freecommand. In threaded mode a MEMORY_BARRIER is issued
// first so earlier stores to the pool precede the update.
820 static void DPSOFTRAST_Draw_SyncCommands(void)
822 if(dpsoftrast.usethreads) MEMORY_BARRIER;
823 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Try to make `space` bytes available in the command pool by recomputing how
// much of it the worker threads still have to consume and, if that is not
// enough, blocking until a lagging worker has caught up.
// NOTE(review): declarations, early returns and the loop structure around
// the wait are not visible here; the comments below describe only the
// visible statements.
826 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
828 DPSOFTRAST_State_Thread *thread;
830 int freecommand = dpsoftrast.commandpool.freecommand;
831 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already?
832 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// make everything queued so far visible to the workers
834 DPSOFTRAST_Draw_SyncCommands();
// for each worker, measure how far its read position is behind the write
// position, wrapping around the end of the pool
840 for (i = 0; i < dpsoftrast.numthreads; i++)
842 thread = &dpsoftrast.threads[i];
843 commandoffset = freecommand - thread->commandoffset;
844 if (commandoffset < 0)
845 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
846 if (commandoffset > usedcommands)
849 usedcommands = commandoffset;
852 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait for worker `waitindex` unless it has already consumed everything published
854 thread = &dpsoftrast.threads[waitindex];
855 Thread_LockMutex(thread->drawmutex);
856 if (thread->commandoffset != dpsoftrast.drawcommand)
858 thread->waiting = true;
// wake the worker if it is idle, then sleep until it signals waitcond
859 if (thread->starving) Thread_CondSignal(thread->drawcond);
860 Thread_CondWait(thread->waitcond, thread->drawmutex);
861 thread->waiting = false;
863 Thread_UnlockMutex(thread->drawmutex);
865 dpsoftrast.commandpool.usedcommands = usedcommands;
// Round `size` up to the next multiple of COMMAND_SIZE. The masking with
// COMMAND_SIZE-1 requires COMMAND_SIZE to be a power of two. `size` is
// evaluated twice.
868 #define DPSOFTRAST_ALIGNCOMMAND(size) \
869 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Reserve pool space for a DPSOFTRAST_Command_<name> (size rounded up as
// above, opcode DPSOFTRAST_OPCODE_<name>) and yield a typed pointer to it.
870 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
871 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserve `size` bytes in the command pool for a command with the given
// opcode and fill in its header (opcode, commandsize). The caller fills in
// the command specific fields.
// NOTE(review): the return statement, the advance of `freecommand` and some
// branch keywords are not visible here.
873 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
875 DPSOFTRAST_Command *command;
876 int freecommand = dpsoftrast.commandpool.freecommand;
877 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one extra command header; if the command does not fit
// in the tail of the pool, the skipped tail counts as needed space as well
878 int extra = sizeof(DPSOFTRAST_Command);
879 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
880 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim it (threaded) or flush (presumably the else branch)
881 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
883 if (dpsoftrast.usethreads)
884 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
886 DPSOFTRAST_Draw_FlushThreads();
// the calls above may have changed the pool bookkeeping
887 freecommand = dpsoftrast.commandpool.freecommand;
888 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit before the end of the pool: mark the tail with a Reset
// command and account for the skipped bytes
890 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
892 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
893 command->opcode = DPSOFTRAST_OPCODE_Reset;
894 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
897 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
898 command->opcode = opcode;
899 command->commandsize = size;
901 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
903 dpsoftrast.commandpool.freecommand = freecommand;
904 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Take back the most recently allocated `size` bytes of the command pool:
// usedcommands is reduced by `size` and the adjusted positions are stored.
// NOTE(review): the statement that moves freecommand back and the condition
// guarding the wrap-around correction are not visible here.
908 static void DPSOFTRAST_UndoCommand(int size)
910 int freecommand = dpsoftrast.commandpool.freecommand;
911 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap-around correction by one pool length
914 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
915 usedcommands -= size;
916 dpsoftrast.commandpool.freecommand = freecommand;
917 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: set the viewport rectangle.
920 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Worker side: store the rectangle in the thread state and mark the
// framebuffer-derived values dirty so they are recomputed on next validation.
921 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
923 thread->viewport[0] = command->x;
924 thread->viewport[1] = command->y;
925 thread->viewport[2] = command->width;
926 thread->viewport[3] = command->height;
927 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API entry: queue a Viewport command for the workers and update the
// producer-side copy of the viewport and its derived transform immediately.
// NOTE(review): the assignments of command->x / command->y are not visible here.
929 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
931 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
934 command->width = width;
935 command->height = height;
937 dpsoftrast.viewport[0] = x;
938 dpsoftrast.viewport[1] = y;
939 dpsoftrast.viewport[2] = width;
940 dpsoftrast.viewport[3] = height;
941 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to an RGBA value.
944 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Worker side: fill the part of the scissor rectangle that lies inside this
// thread's row bands with the packed BGRA8 color, for every color target
// that is present.
// NOTE(review): some declarations, early-outs and the inner row loop header
// are not visible here.
945 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
947 int i, x1, y1, x2, y2, w, h, x, y;
948 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row bands must be current
952 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
953 miny1 = thread->miny1;
954 maxy1 = thread->maxy1;
955 miny2 = thread->miny2;
956 maxy2 = thread->maxy2;
957 x1 = thread->fb_scissor[0];
958 y1 = thread->fb_scissor[1];
959 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
960 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// limit the rows to the span covered by this thread's two bands
961 if (y1 < miny1) y1 = miny1;
962 if (y2 > maxy2) y2 = maxy2;
967 // FIXME: honor fb_colormask?
968 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
969 for (i = 0;i < 4;i++)
// targets that are not attached are tested for here
971 if (!dpsoftrast.fb_colorpixels[i])
// two passes: first band [.., maxy1), then, after jumping y forward to miny2,
// second band [.., maxy2)
973 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
976 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
977 for (x = x1;x < x2;x++)
// API entry: queue a ClearColor command.
// NOTE(review): the assignments of the r/g/b/a fields are not visible here.
982 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
984 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to a depth value.
991 DEFCOMMAND(3, ClearDepth, float depth;)
// Worker side: fill the part of the scissor rectangle that lies inside this
// thread's row bands with the integer depth value derived from command->depth.
// NOTE(review): some declarations and early-outs are not visible here.
992 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
994 int x1, y1, x2, y2, w, h, x, y;
995 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row bands must be current
999 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1000 miny1 = thread->miny1;
1001 maxy1 = thread->maxy1;
1002 miny2 = thread->miny2;
1003 maxy2 = thread->maxy2;
1004 x1 = thread->fb_scissor[0];
1005 y1 = thread->fb_scissor[1];
1006 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1007 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// limit the rows to the span covered by this thread's two bands
1008 if (y1 < miny1) y1 = miny1;
1009 if (y2 > maxy2) y2 = maxy2;
1014 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two passes: first band up to maxy1, then second band from miny2 up to maxy2
1015 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1016 for (;y < bandy;y++)
1018 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1019 for (x = x1;x < x2;x++)
// API entry: queue a ClearDepth command.
// NOTE(review): the assignment of the depth field is not visible here.
1023 void DPSOFTRAST_ClearDepth(float d)
1025 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: enable or disable writes per color channel.
1029 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Worker side: normalise each flag to 0/1 and build the matching BGRA8 bit mask.
1030 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1032 thread->colormask[0] = command->r != 0;
1033 thread->colormask[1] = command->g != 0;
1034 thread->colormask[2] = command->b != 0;
1035 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all-ones, which selects the channel's byte:
// red bits 16-23, green 8-15, blue 0-7, alpha 24-31
1036 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API entry: queue a ColorMask command.
// NOTE(review): the assignments of the r/g/b/a fields are not visible here.
1038 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1040 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: enable or disable depth testing.
1047 DEFCOMMAND(5, DepthTest, int enable;)
// Worker side: record the flag; the effective depth function depends on it,
// so it is marked dirty.
1048 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1050 thread->depthtest = command->enable;
1051 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API entry: queue a DepthTest command carrying `enable`.
1053 void DPSOFTRAST_DepthTest(int enable)
1055 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1056 command->enable = enable;
// Command 6: enable or disable the scissor test.
1059 DEFCOMMAND(6, ScissorTest, int enable;)
// Worker side: record the flag; the framebuffer scissor rectangle depends on
// it, so the framebuffer-derived values are marked dirty.
1060 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1062 thread->scissortest = command->enable;
1063 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a ScissorTest command carrying `enable`.
1065 void DPSOFTRAST_ScissorTest(int enable)
1067 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1068 command->enable = enable;
// Command 7 (Scissor): carries the scissor rectangle as four floats.
1071 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Applies a Scissor command: copies x, y, width, height into
// thread->scissor[0..3] and sets the DPSOFTRAST_VALIDATE_FB bit.
1072 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1074 thread->scissor[0] = command->x;
1075 thread->scissor[1] = command->y;
1076 thread->scissor[2] = command->width;
1077 thread->scissor[3] = command->height;
1078 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Scissor command.
// NOTE(review): only width/height stores are visible here; x and y are
// presumably stored into the command as well — confirm.
1080 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1082 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1085 command->width = width;
1086 command->height = height;
// Command 8 (BlendFunc): carries the source and destination blend factors.
1089 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Applies a BlendFunc command: stores the factors in thread->blendfunc[0..1]
// and sets the DPSOFTRAST_VALIDATE_BLENDFUNC bit in thread->validate.
1090 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1092 thread->blendfunc[0] = command->sfactor;
1093 thread->blendfunc[1] = command->dfactor;
1094 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: queues a BlendFunc command with the given factors.
1096 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1098 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1099 command->sfactor = sfactor;
1100 command->dfactor = dfactor;
// Command 9 (BlendSubtract): carries a single int enable flag.
1103 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command: stores the flag in thread->blendsubtract
// and sets the DPSOFTRAST_VALIDATE_BLENDFUNC bit in thread->validate.
1104 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1106 thread->blendsubtract = command->enable;
1107 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: queues a BlendSubtract command carrying `enable`.
1109 void DPSOFTRAST_BlendSubtract(int enable)
1111 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1112 command->enable = enable;
// Command 10 (DepthMask): carries a single int enable flag.
1115 DEFCOMMAND(10, DepthMask, int enable;)
// Applies a DepthMask command: stores the flag in thread->depthmask.
1116 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1118 thread->depthmask = command->enable;
// Public entry point: queues a DepthMask command carrying `enable`.
1120 void DPSOFTRAST_DepthMask(int enable)
1122 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1123 command->enable = enable;
// Command 11 (DepthFunc): carries the depth comparison function as an int.
1126 DEFCOMMAND(11, DepthFunc, int func;)
// Applies a DepthFunc command: stores the function code in thread->depthfunc.
1127 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1129 thread->depthfunc = command->func;
// Public entry point: queues a DepthFunc command carrying `func`.
1131 void DPSOFTRAST_DepthFunc(int func)
1133 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1134 command->func = func;
// Command 12 (DepthRange): carries the near and far depth-range values.
1137 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: stores near/far in thread->depthrange[0..1].
1138 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1140 thread->depthrange[0] = command->nearval;
1141 thread->depthrange[1] = command->farval;
// Public entry point: queues a DepthRange command with the given near/far values.
1143 void DPSOFTRAST_DepthRange(float nearval, float farval)
1145 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1146 command->nearval = nearval;
1147 command->farval = farval;
// Command 13 (PolygonOffset): carries two float offset parameters.
1150 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: stores alongnormal/intoview in
// thread->polygonoffset[0..1].
1151 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1153 thread->polygonoffset[0] = command->alongnormal;
1154 thread->polygonoffset[1] = command->intoview;
// Public entry point: queues a PolygonOffset command with the given offsets.
1156 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1158 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1159 command->alongnormal = alongnormal;
1160 command->intoview = intoview;
// Command 14 (CullFace): carries the face-culling mode as an int.
1163 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command: stores the mode in thread->cullface.
1164 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1166 thread->cullface = command->mode;
// Public entry point: queues a CullFace command carrying `mode`.
1168 void DPSOFTRAST_CullFace(int mode)
1170 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1171 command->mode = mode;
// Sets the constant RGBA color in the global dpsoftrast state.
// No command is allocated here; the value is written directly to
// dpsoftrast.color, which the array loader uses as its fill color when no
// color pointer is set.
1174 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1176 dpsoftrast.color[0] = r;
1177 dpsoftrast.color[1] = g;
1178 dpsoftrast.color[2] = b;
1179 dpsoftrast.color[3] = a;
// Reads a rectangular block of pixels from color buffer 0 into outpixels.
//   blockx, blocky, blockwidth, blockheight: requested rectangle
//   outpixels: destination, written with a row pitch of blockwidth * 4 bytes
// The rectangle is clamped to the framebuffer. Source rows are addressed as
// (fb_height - 1 - y), so the image is flipped vertically on the way out.
// NOTE(review): assuming bx1/by1 start as blockx/blocky, the output offset is
// computed from the clamped by1 and has no x term, so a request that is clipped
// at the top or left lands shifted in outpixels — confirm this is intended.
1182 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// Row pitches in bytes; both buffers use 4 bytes per pixel.
1184 int outstride = blockwidth * 4;
1185 int instride = dpsoftrast.fb_width * 4;
1188 int bx2 = blockx + blockwidth;
1189 int by2 = blocky + blockheight;
1193 unsigned char *inpixels;
// Clamp the rectangle to the framebuffer bounds.
1197 if (bx1 < 0) bx1 = 0;
1198 if (by1 < 0) by1 = 0;
1199 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1200 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1202 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Big-endian hosts take a per-pixel loop (presumably to reorder bytes — confirm).
1203 if (dpsoftrast.bigendian)
1205 for (y = by1;y < by2;y++)
1207 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1208 o = (unsigned char *)outpixels + (y - by1) * outstride;
1209 for (x = bx1;x < bx2;x++)
// Second row loop with the same addressing (presumably the little-endian path — confirm).
1222 for (y = by1;y < by2;y++)
1224 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1225 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle from color buffer 0 into one mip level of a texture.
//   index, mip: target texture handle and mip level
//   tx, ty:     destination origin in the texture
//   sx, sy:     source origin in the framebuffer
//   width, height: requested size
// Returns without copying when the texture lookup fails or mip is out of range.
// Both rectangles are clamped, the copy size is limited to the smaller of the
// two, and rows are copied with memcpy at 4 bytes per pixel.
1231 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 int tx2 = tx + width;
1236 int ty2 = ty + height;
1239 int sx2 = sx + width;
1240 int sy2 = sy + height;
1250 unsigned int *spixels;
1251 unsigned int *tpixels;
1252 DPSOFTRAST_Texture *texture;
1253 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1254 if (mip < 0 || mip >= texture->mipmaps) return;
// Source is the framebuffer; destination is the selected mip level
// (mipmap[mip][0] is a byte offset into texture->bytes, [2]/[3] its width/height).
1256 spixels = dpsoftrast.fb_colorpixels[0];
1257 swidth = dpsoftrast.fb_width;
1258 sheight = dpsoftrast.fb_height;
1259 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1260 twidth = texture->mipmap[mip][2];
1261 theight = texture->mipmap[mip][3];
// Clamp the destination and source rectangles to their surfaces.
1262 if (tx1 < 0) tx1 = 0;
1263 if (ty1 < 0) ty1 = 0;
1264 if (tx2 > twidth) tx2 = twidth;
1265 if (ty2 > theight) ty2 = theight;
1266 if (sx1 < 0) sx1 = 0;
1267 if (sy1 < 0) sy1 = 0;
1268 if (sx2 > swidth) sx2 = swidth;
1269 if (sy2 > sheight) sy2 = sheight;
// Copy only the area both rectangles can supply.
1274 if (tw > sw) tw = sw;
1275 if (th > sh) th = sh;
// Empty-rectangle check (presumably an early return — confirm).
1276 if (tw < 1 || th < 1)
// Convert both y origins to count from the opposite edge of their surface.
1278 sy1 = sheight - sy1 - th;
1279 ty1 = theight - ty1 - th;
1280 for (y = 0;y < th;y++)
1281 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// Rebuild the mip chain when the texture has more than one level.
1282 if (texture->mipmaps > 1)
1283 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture): carries a texture unit number and a texture pointer.
1286 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state.
// If the unit already has a texture bound in this thread, that texture's
// `binds` counter is atomically decremented before the new pointer is stored.
1287 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1289 if (thread->texbound[command->unitnum])
1290 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1291 thread->texbound[command->unitnum] = command->texture;
// Public entry point: binds the texture with handle `index` to unit `unitnum`.
// Sets dpsoftrast.errorstring when unitnum is outside
// [0, DPSOFTRAST_MAXTEXTUREUNITS) or when a non-zero index has no texture
// (presumably returning in both cases — confirm).
1293 void DPSOFTRAST_SetTexture(int unitnum, int index)
1295 DPSOFTRAST_Command_SetTexture *command;
1296 DPSOFTRAST_Texture *texture;
1297 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1299 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1302 texture = DPSOFTRAST_Texture_GetByIndex(index);
// Index 0 with no texture is accepted; only a non-zero index must resolve.
1303 if (index && !texture)
1305 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1309 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1310 command->unitnum = unitnum;
1311 command->texture = texture;
// Mirror the binding in the global (non-thread) state.
1313 dpsoftrast.texbound[unitnum] = texture;
// Add one bind per thread up front; each thread's interpreter decrements it
// once when it later replaces this binding.
// NOTE(review): texture can be NULL when index is 0 — confirm a guard precedes this.
1315 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array pointer and its stride in the global
// dpsoftrast state. The pointer is stored, not copied. The array loader
// multiplies the stride against a byte pointer, so it is in bytes.
1318 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1320 dpsoftrast.pointer_vertex3f = vertex3f;
1321 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array as the color source.
// Clears the unsigned-byte color pointer so that only one color source is active.
1323 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1325 dpsoftrast.pointer_color4f = color4f;
1326 dpsoftrast.pointer_color4ub = NULL;
1327 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array as the color source.
// Clears the float color pointer so that only one color source is active.
1329 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1331 dpsoftrast.pointer_color4f = NULL;
1332 dpsoftrast.pointer_color4ub = color4ub;
1333 dpsoftrast.stride_color = stride;
// Records the texcoord array pointer, component count and stride for one unit.
// NOTE(review): unitnum indexes the arrays directly; no range check is visible here.
1335 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1337 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1338 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1339 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader): carries shader mode, permutation bits and the
// exact-specular-math flag.
1342 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command: copies the three values into the thread state.
1343 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1345 thread->shader_mode = command->mode;
1346 thread->shader_permutation = command->permutation;
1347 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: queues a SetShader command and mirrors the same values
// into the global dpsoftrast state.
1349 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1351 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1352 command->mode = mode;
1353 command->permutation = permutation;
1354 command->exactspecularmath = exactspecularmath;
// Keep the global copy in step with what was queued.
1356 dpsoftrast.shader_mode = mode;
1357 dpsoftrast.shader_permutation = permutation;
1358 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Command 19 (Uniform4f): carries a uniform index and four float values.
1361 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into the thread's
// uniform4f array at float offset index*4 (each uniform slot is 4 floats).
1362 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1364 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: queues a Uniform4f command for `index` with (v0..v3)
// and mirrors the values into the global dpsoftrast.uniform4f array.
1366 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1368 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1369 command->index = index;
1370 command->val[0] = v0;
1371 command->val[1] = v1;
1372 command->val[2] = v2;
1373 command->val[3] = v3;
// Global copy; each uniform slot occupies 4 consecutive floats.
1375 dpsoftrast.uniform4f[index*4+0] = v0;
1376 dpsoftrast.uniform4f[index*4+1] = v1;
1377 dpsoftrast.uniform4f[index*4+2] = v2;
1378 dpsoftrast.uniform4f[index*4+3] = v3;
// Array-argument form of the Uniform4f entry point: v must point to at least
// 4 floats. Queues a Uniform4f command and mirrors the values globally.
1380 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1382 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1383 command->index = index;
1384 memcpy(command->val, v, sizeof(command->val));
1386 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f): carries a uniform index and a 16-float
// matrix declared through the ALIGN macro.
1389 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into the thread's
// uniform4f array starting at float offset index*4 (four consecutive slots).
1390 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1392 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: uploads `arraysize` consecutive 4x4 float matrices
// starting at uniform slot `uniform`.
//   transpose: NOTE(review) presumably gates the transpose sequence below —
//              the condition is not visible here; confirm.
//   v: source floats, 16 per matrix; may be unaligned.
// One UniformMatrix4f command is queued per matrix and the result is also
// written into the global dpsoftrast.uniform4f array.
1394 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// Each matrix takes 4 uniform slots (index += 4) and 16 source floats (v += 16).
1398 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1400 __m128 m0, m1, m2, m3;
1401 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1402 command->index = (DPSOFTRAST_UNIFORM)index;
// Use unaligned loads when v is not a multiple of ALIGN_SIZE, aligned loads otherwise.
1403 if (((size_t)v)&(ALIGN_SIZE-1))
1405 m0 = _mm_loadu_ps(v);
1406 m1 = _mm_loadu_ps(v+4);
1407 m2 = _mm_loadu_ps(v+8);
1408 m3 = _mm_loadu_ps(v+12);
1412 m0 = _mm_load_ps(v);
1413 m1 = _mm_load_ps(v+4);
1414 m2 = _mm_load_ps(v+8);
1415 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine halves so that each
// output register holds one column of the input.
1419 __m128 t0, t1, t2, t3;
1420 t0 = _mm_unpacklo_ps(m0, m1);
1421 t1 = _mm_unpacklo_ps(m2, m3);
1422 t2 = _mm_unpackhi_ps(m0, m1);
1423 t3 = _mm_unpackhi_ps(m2, m3);
1424 m0 = _mm_movelh_ps(t0, t1);
1425 m1 = _mm_movehl_ps(t1, t0);
1426 m2 = _mm_movelh_ps(t2, t3);
1427 m3 = _mm_movehl_ps(t3, t2);
// Aligned stores: both destinations must be 16-byte aligned.
1429 _mm_store_ps(command->val, m0);
1430 _mm_store_ps(command->val+4, m1);
1431 _mm_store_ps(command->val+8, m2);
1432 _mm_store_ps(command->val+12, m3);
1433 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1434 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1435 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i): carries a uniform index and a single int value.
1441 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores the value in thread->uniform1i[index].
1442 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1444 thread->uniform1i[command->index] = command->val;
// Public entry point: queues a Uniform1i command and mirrors the value into
// the global dpsoftrast.uniform1i array.
// NOTE(review): i0 is presumably also stored into command->val — confirm.
1446 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1448 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1449 command->index = index;
1452 dpsoftrast.uniform1i[command->index] = i0;
// Command 24 (ClipPlane): carries the four plane coefficients.
1455 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Applies a ClipPlane command: copies the four floats into thread->clipplane
// and sets the DPSOFTRAST_VALIDATE_FB bit in thread->validate.
1456 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1458 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1459 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a ClipPlane command with coefficients (x, y, z, w).
1461 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1463 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1464 command->clipplane[0] = x;
1465 command->clipplane[1] = y;
1466 command->clipplane[2] = z;
1467 command->clipplane[3] = w;
// Copies `size` 4-float vectors from a strided byte source into dst.
//   dst:    written with aligned stores, so it must be 16-byte aligned
//   src:    first source element; stride is the byte distance between elements
// NOTE(review): the loops that advance dst/src up to `end` are not visible here.
1471 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1473 float *end = dst + size*4;
// Unaligned loads are needed if either the base pointer or the stride breaks
// ALIGN_SIZE alignment; otherwise every element can use aligned loads.
1474 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1478 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1487 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float vectors from a strided byte source into 4-float
// vectors in dst, setting the fourth component to 1.0.
//   dst: written with aligned stores, so it must be 16-byte aligned
// The recurring shuffle / move_ss / shuffle triple rotates the vector so the
// lane to replace sits in lane 0, writes 1.0 there, and rotates back so the
// 1.0 lands in lane 3.
// NOTE(review): the loop headers are not visible here.
1494 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1496 float *end = dst + size*4;
// Tightly packed input: four vertices occupy exactly three 16-byte loads.
1497 if (stride == sizeof(float[3]))
// end4 bounds the part of dst covered by whole groups of four vertices.
1499 float *end4 = dst + (size&~3)*4;
1500 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Unaligned variant. v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3.
1504 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1505 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1506 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1507 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1508 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1509 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1511 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1512 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1513 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1514 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1515 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1516 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1518 src += 4*sizeof(float[3]);
// Aligned variant of the same four-vertex expansion.
1525 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1526 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1527 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1528 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1529 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1530 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1533 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1534 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1535 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1536 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1537 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1539 src += 4*sizeof(float[3]);
// Per-vertex path. Each load fetches 16 bytes, i.e. one float beyond the three
// components; that extra lane is then overwritten with 1.0.
// NOTE(review): for the last element this reads 4 bytes past the vertex —
// confirm the source buffers always have that slack.
1543 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1547 __m128 v = _mm_loadu_ps((const float *)src);
1548 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1549 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1550 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1551 _mm_store_ps(dst, v);
1560 __m128 v = _mm_load_ps((const float *)src);
1561 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1562 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1563 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1564 _mm_store_ps(dst, v);
// Expands `size` 2-float vectors from a strided byte source into 4-float
// vectors (x, y, 0, 1) in dst.
//   dst: written with aligned stores, so it must be 16-byte aligned
// NOTE(review): the loop headers are not visible here.
1571 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1573 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output vector.
1574 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// Tightly packed input: one 16-byte load holds two vertices.
1575 if (stride == sizeof(float[2]))
// end2 bounds the part of dst covered by whole pairs of vertices.
1577 float *end2 = dst + (size&~1)*4;
1578 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = x0 y0 x1 y1: the shuffle yields (x0, y0, 0, 1), movehl yields (x1, y1, 0, 1).
1582 __m128 v = _mm_loadu_ps((const float *)src);
1583 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1584 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1586 src += 2*sizeof(float[2]);
// Aligned variant of the same two-vertex expansion.
1593 __m128 v = _mm_load_ps((const float *)src);
1594 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1595 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1597 src += 2*sizeof(float[2]);
// Single-vertex form: replaces the low two lanes of v2 with the 8 bytes at src.
1603 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte vectors from a strided byte source into 4-float
// vectors in dst, scaling each byte by 1/255.
//   dst: written with aligned stores, so it must be 16-byte aligned
// Bytes are zero-extended to 16 and then 32 bits by unpacking against zero,
// converted to float, and multiplied by `scale`.
// NOTE(review): the loop headers are not visible here.
1609 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1611 float *end = dst + size*4;
1612 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// Tightly packed input: one 16-byte load holds four vertices.
1613 if (stride == sizeof(unsigned char[4]))
// end4 bounds the part of dst covered by whole groups of four vertices.
1615 float *end4 = dst + (size&~3)*4;
1616 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Unaligned variant: v1 holds vertices 0-1 as 16-bit lanes, v2 holds vertices 2-3.
1620 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1621 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1622 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1623 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1624 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1626 src += 4*sizeof(unsigned char[4]);
// Aligned variant of the same four-vertex conversion.
1633 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1634 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1635 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1636 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1637 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1639 src += 4*sizeof(unsigned char[4]);
// Single-vertex form: reads the four bytes through an int pointer.
// NOTE(review): this assumes src is suitably aligned for an int access — confirm.
1645 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1646 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of the single 4-float vector at src.
//   dst:  written with aligned stores, so it must be 16-byte aligned
//   src:  read once with an unaligned load
//   size: number of 4-float vectors (end = dst + 4*size)
// NOTE(review): the loop around the store is not visible here.
1652 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1654 float *end = dst + 4*size;
1655 __m128 v = _mm_loadu_ps(src);
1658 _mm_store_ps(dst, v);
// Transforms `numitems` 4-float vectors by a 4x4 matrix.
//   out4f, in4f: may be the same buffer (in-place use is explicitly handled)
//   inmatrix16f: 16 floats, loaded unaligned as four rows m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
// NOTE(review): the loop headers and the store of each result are not visible here.
1664 static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1667 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1668 __m128 m0, m1, m2, m3;
// A byte-exact match with the identity matrix skips the math and copies the
// data (presumably followed by an early return — confirm).
1670 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1672 // fast case for identity matrix
1673 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1676 end = out4f + numitems*4;
1677 m0 = _mm_loadu_ps(inmatrix16f);
1678 m1 = _mm_loadu_ps(inmatrix16f + 4);
1679 m2 = _mm_loadu_ps(inmatrix16f + 8);
1680 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Input alignment selects unaligned or aligned vertex loads.
1681 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1685 __m128 v = _mm_loadu_ps(in4f);
// Broadcast each component of v and accumulate it against its matrix row.
1687 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1688 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1690 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1699 __m128 v = _mm_load_ps(in4f);
1701 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1702 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1704 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vectors from in4f to out4f with memcpy.
// The buffers must not overlap, as memcpy requires.
1713 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1715 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Projects one clip-space vertex `in` (x, y, z, w) to viewport space in `out`.
// The vector is rotated to (1, x, y, z) with 1.0 placed in lane 0, then
// center + scale * p / w is evaluated, and the result is rotated back. Output
// lanes 0..2 therefore hold the projected x, y, z and lane 3 holds
// center[0] + scale[0] / w. viewportcenter and viewportscale are applied in the
// rotated lane order, so their lane 0 belongs to the w slot.
1720 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1722 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1723 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1724 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1725 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one 4-float vector by a matrix given as four rows m0..m3:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
// NOTE(review): p is presumably a local copy of `in`; its declaration is not
// visible here — confirm.
1728 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1731 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1732 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1733 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1734 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space row range [*starty, *endy) from a bounding box.
//   minposf, maxposf: box corners, 4 floats each, must be 16-byte aligned
//                     (they are read with _mm_load_ps)
//   inmatrix16f:      matrix applied to the box corners (loaded unaligned)
// The eight corners are transformed; corners with z + w >= 0 contribute their
// projected value directly, and clipped corners contribute the points where
// their box edges cross z + w = 0.
// NOTE(review): the return statement is not visible; callers store the result
// in dpsoftrast.drawclipped, so it is presumably the clip mask — confirm.
1737 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// One bit per corner; all eight start out marked as clipped.
1739 int clipmask = 0xFF;
1740 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// The running min/max start at the opposite extremes of the [-2, 2] clamp
// range applied at the end.
1741 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1742 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1743 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// Swap lanes 0 and 1 of every matrix row so the transformed y lands in lane 0,
// where the scalar (_ss) operations below act.
1744 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1745 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1746 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1747 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its clip distance (z + w in
// lane 0), and if it is not clipped clear its mask bit and fold lane0 / w into
// minproj / maxproj.
1748 #define BBFRONT(k, pos) \
1750 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1751 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1752 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1755 clipmask &= ~(1<<k); \
1756 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1757 minproj = _mm_min_ss(minproj, proj); \
1758 maxproj = _mm_max_ss(maxproj, proj); \
1762 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1763 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1764 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1765 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1766 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1767 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k) body: for a clipped corner k, visit its three edge neighbours
// (k^1, k^2, k^4). For each neighbour that is not clipped, interpolate along
// the edge to the point where the clip distance reaches zero
// (frac = d_k / (d_k - d_neighbour)), project it, and fold it into min/max.
1771 if (clipmask&(1<<k)) \
1773 if (!(clipmask&(1<<(k^1)))) \
1775 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1776 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1777 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1778 minproj = _mm_min_ss(minproj, proj); \
1779 maxproj = _mm_max_ss(maxproj, proj); \
1781 if (!(clipmask&(1<<(k^2)))) \
1783 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1784 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1785 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1786 minproj = _mm_min_ss(minproj, proj); \
1787 maxproj = _mm_max_ss(maxproj, proj); \
1789 if (!(clipmask&(1<<(k^4)))) \
1791 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1792 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1793 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1794 minproj = _mm_min_ss(minproj, proj); \
1795 maxproj = _mm_max_ss(maxproj, proj); \
1799 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// Move element 2 of the viewport vectors into lane 0 (presumably the y term,
// given the rotated layout the projection macro expects — confirm), clamp the
// projected range to [-2, 2], and map it to screen space.
1800 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1801 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1802 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1803 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1804 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1805 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// The start row comes from maxproj and the end row from minproj
// (presumably because the y viewport scale is negative — confirm).
// Both are truncated toward zero; the end is made exclusive with +1.
1806 *starty = _mm_cvttss_si32(maxproj);
1807 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed vertices to screen space.
//   out4f:    receives a copy of each input vertex (aligned stores)
//   screen4f: receives the projected vertex (aligned stores)
//   starty, endy: passed through to DPSOFTRAST_Vertex_BoundY
//   in4f:     source vertices; may be unaligned
// The component-wise min/max of the inputs is tracked and handed to
// DPSOFTRAST_Vertex_BoundY with an identity matrix; its result is returned.
// NOTE(review): the loop headers are not visible here, and other return
// paths may exist.
1811 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1813 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1814 float *end = out4f + numitems*4;
1815 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1816 __m128 minpos, maxpos;
// Input alignment selects unaligned or aligned vertex loads.
1817 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Seed the bounds with the first vertex.
1819 minpos = maxpos = _mm_loadu_ps(in4f);
1822 __m128 v = _mm_loadu_ps(in4f);
1823 minpos = _mm_min_ps(minpos, v);
1824 maxpos = _mm_max_ps(maxpos, v);
1825 _mm_store_ps(out4f, v);
1826 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1827 _mm_store_ps(screen4f, v);
// Aligned variant of the same loop.
1835 minpos = maxpos = _mm_load_ps(in4f);
1838 __m128 v = _mm_load_ps(in4f);
1839 minpos = _mm_min_ps(minpos, v);
1840 maxpos = _mm_max_ps(maxpos, v);
1841 _mm_store_ps(out4f, v);
1842 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned temporaries, since BoundY takes float pointers.
1851 ALIGN(float minposf[4]);
1852 ALIGN(float maxposf[4]);
1853 _mm_store_ps(minposf, minpos);
1854 _mm_store_ps(maxposf, maxpos);
1855 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` vertices by inmatrix16f and projects them to screen space.
//   out4f:    receives the transformed vertices (aligned stores)
//   screen4f: receives the projected vertices (aligned stores)
//   starty, endy: passed through to DPSOFTRAST_Vertex_BoundY
//   in4f:     source vertices; may be unaligned
// A byte-exact identity matrix delegates to DPSOFTRAST_Vertex_Project. Otherwise
// the min/max is taken over the untransformed inputs, and that box together
// with the matrix is handed to DPSOFTRAST_Vertex_BoundY, whose result is returned.
// NOTE(review): the loop headers are not visible here, and other return
// paths may exist.
1860 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1862 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1863 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1865 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1866 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1867 end = out4f + numitems*4;
1868 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1869 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1870 m0 = _mm_loadu_ps(inmatrix16f);
1871 m1 = _mm_loadu_ps(inmatrix16f + 4);
1872 m2 = _mm_loadu_ps(inmatrix16f + 8);
1873 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Input alignment selects unaligned or aligned vertex loads.
1874 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Seed the bounds with the first vertex.
1876 minpos = maxpos = _mm_loadu_ps(in4f);
1879 __m128 v = _mm_loadu_ps(in4f);
// The bounds are updated before the transform, so they stay in input space.
1880 minpos = _mm_min_ps(minpos, v);
1881 maxpos = _mm_max_ps(maxpos, v);
1882 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1883 _mm_store_ps(out4f, v);
1884 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1885 _mm_store_ps(screen4f, v);
// Aligned variant of the same loop.
1893 minpos = maxpos = _mm_load_ps(in4f);
1896 __m128 v = _mm_load_ps(in4f);
1897 minpos = _mm_min_ps(minpos, v);
1898 maxpos = _mm_max_ps(maxpos, v);
1899 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1900 _mm_store_ps(out4f, v);
1901 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1902 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned temporaries, since BoundY takes float pointers.
1910 ALIGN(float minposf[4]);
1911 ALIGN(float maxposf[4]);
1912 _mm_store_ps(minposf, minpos);
1913 _mm_store_ps(maxposf, maxpos);
1914 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one input vertex attribute array into the 4-float working array
// dpsoftrast.post_array4f[outarray], covering the current draw range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices).
//   inarray: which attribute to read — position, color, or a texcoord array
//            addressed relative to DPSOFTRAST_ARRAY_TEXCOORD0
// Source addresses are computed as base + firstvertex * stride in bytes.
// NOTE(review): the switch header, the texcoord case labels and the return
// statement are not visible here.
1920 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1923 float *outf = dpsoftrast.post_array4f[outarray];
1924 const unsigned char *inb;
1925 int firstvertex = dpsoftrast.firstvertex;
1926 int numvertices = dpsoftrast.numvertices;
// Positions are 3-float vectors, expanded to 4 floats.
1930 case DPSOFTRAST_ARRAY_POSITION:
1931 stride = dpsoftrast.stride_vertex;
1932 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1933 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// Colors come from the float array, the byte array, or the constant
// dpsoftrast.color, in that order of preference.
1935 case DPSOFTRAST_ARRAY_COLOR:
1936 stride = dpsoftrast.stride_color;
1937 if (dpsoftrast.pointer_color4f)
1939 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1940 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1942 else if (dpsoftrast.pointer_color4ub)
// stride already holds stride_color from above; this re-read is redundant.
1944 stride = dpsoftrast.stride_color;
1945 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1946 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1950 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// Texcoord arrays: the loader is chosen by the stored component count
// (presumably 2, 3 and 4 select the three loaders below — confirm).
1954 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1955 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1957 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1958 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1961 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1964 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1967 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f.
// When inarray >= 0 the array is first loaded from that input attribute;
// otherwise the existing contents of post_array4f[outarray] are used.
// NOTE(review): the return statement is not visible; presumably returns `data` — confirm.
1979 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1981 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1982 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without transforming it.
// Loads from inarray when it is >= 0, otherwise uses the existing array.
// Writes the projected vertices to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty / drawendy, and the result of
// DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible; presumably returns `data` — confirm.
1987 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1990 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1991 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to
// screen space. Loads from inarray when it is >= 0, otherwise uses the existing
// array. Writes the projected vertices to dpsoftrast.screencoord4f, the row
// range to dpsoftrast.drawstarty / drawendy, and the result of
// DPSOFTRAST_Vertex_TransformProject to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible; presumably returns `data` — confirm.
1999 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2002 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2003 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for one span, over [startx, endx).
// w varies linearly across the span: triangle->w[0] is the x slope,
// triangle->w[1] the y slope and triangle->w[2] the constant term.
// NOTE(review): the loop bodies are not visible; the values are presumably
// written to zf[x] — confirm.
2010 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2013 int startx = span->startx;
2014 int endx = span->endx;
2015 float wslope = triangle->w[0];
// w at the span origin (span->x, span->y).
2016 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// Exact reciprocal at the first pixel.
2017 float endz = 1.0f / (w + wslope * startx);
// With no slope in x the reciprocal is constant along the span.
2018 if (triangle->w[0] == 0)
2020 // LordHavoc: fast flat polygons (HUD/menu)
2021 for (x = startx;x < endx;x++)
// General case: take an exact reciprocal only at subspan boundaries, at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart, and step linearly in between.
2025 for (x = startx;x < endx;)
2027 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// Clamp the final subspan to the last pixel of the span.
2029 if (nextsub >= endx) nextsub = endsub = endx-1;
2030 endz = 1.0f / (w + wslope * nextsub);
// The x < nextsub guard avoids a division by zero on a one-pixel subspan.
2031 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2032 for (; x <= endsub; x++, z += dz)
// Writes the shaded pixels of one span (in4ub, 4 bytes per pixel, indexed by
// x*4) into dpsoftrast.fb_colorpixels[0], combining them with the existing
// framebuffer contents according to thread->fb_blendmode and skipping pixels
// whose span->pixelmask entry is zero.
//   thread   - supplies shader_permutation (alpha kill) and fb_blendmode
//   triangle - not referenced by any line visible in this listing
//   span     - supplies startx/endx, the framebuffer position (x, y) and the
//              pixelmask array, which this function also modifies
//   in4ub    - source pixels; byte 3 of each pixel is treated as alpha
// NOTE(review): the embedded line numbers skip, so this listing does not show
// every original line (braces and short statements are absent).
2037 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2041 int startx = span->startx;
2042 int endx = span->endx;
2045 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2046 unsigned char * RESTRICT pixelmask = span->pixelmask;
2047 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move to the span origin so that pixeli[x] is the destination of ini[x]
2050 pixeli += span->y * dpsoftrast.fb_width + span->x;
2051 // handle alphatest now (this affects depth writes too)
2052 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2053 for (x = startx;x < endx;x++)
2054 if (in4ub[x*4+3] < 128)
2055 pixelmask[x] = false;
2056 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2057 // helps sprites, text and hud artwork
2058 switch(thread->fb_blendmode)
// alpha-weighted modes: pixels with alpha 0 are masked off
2060 case DPSOFTRAST_BLENDMODE_ALPHA:
2061 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2062 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2064 for (x = startx;x < endx;x++)
2066 if (in4ub[x*4+3] >= 1)
// skip over a run of pixels with non-zero alpha
2071 while (++x < endx && in4ub[x*4+3] >= 1) ;
2073 if (x >= endx) break;
// mask off the following run of zero-alpha pixels
2075 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2076 if (x >= endx) break;
2083 case DPSOFTRAST_BLENDMODE_OPAQUE:
2084 case DPSOFTRAST_BLENDMODE_ADD:
2085 case DPSOFTRAST_BLENDMODE_INVMOD:
2086 case DPSOFTRAST_BLENDMODE_MUL:
2087 case DPSOFTRAST_BLENDMODE_MUL2:
2088 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2089 case DPSOFTRAST_BLENDMODE_INVADD:
2092 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): this writes pixelmask[endx] and pixelmask[endx+1], so the
// mask array must have at least endx+2 entries - confirm against its allocation
2093 pixelmask[endx] = 1;
2094 pixelmask[endx+1] = 0;
2095 // LordHavoc: use a double loop to identify subspans, this helps the
2096 // optimized copy/blend loops to perform at their best, most triangles
2097 // have only one run of pixels, and do the search using wide reads...
2101 // if this pixel is masked off, it's probably not alone...
2108 // the 4-item search must be aligned or else it stalls badly
2109 if ((x & 3) && !pixelmask[x])
2111 if(pixelmask[x]) goto endmasked;
2115 if(pixelmask[x]) goto endmasked;
2119 if(pixelmask[x]) goto endmasked;
// test four mask bytes per iteration through an unsigned int read
// NOTE(review): reading an unsigned char array through an unsigned int lvalue
// relies on the compiler tolerating this aliasing - confirm build flags
2124 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2128 for (;!pixelmask[x];x++)
2130 // rather than continue the loop, just check the end variable
2135 // find length of subspan
2138 if (subx + 8 < endx)
2142 if(!pixelmask[subx]) goto endunmasked;
2146 if(!pixelmask[subx]) goto endunmasked;
2150 if(!pixelmask[subx]) goto endunmasked;
// four consecutive mask bytes all equal to 1
2155 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2159 for (;pixelmask[subx];subx++)
2161 // the checks can overshoot, so make sure to clip it...
2165 // now that we know the subspan length... process!
2166 switch(thread->fb_blendmode)
// opaque: plain copy of pixels [x, subx), in 16-, 4- and 1-pixel steps
2168 case DPSOFTRAST_BLENDMODE_OPAQUE:
2172 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2177 while (x + 16 <= subx)
2179 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2180 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2181 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2182 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2187 while (x + 4 <= subx)
2189 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2195 pixeli[x+1] = ini[x+1];
// FINISHBLEND widens source and destination bytes to 16-bit lanes, applies
// blend2 to two pixels at a time and blend1 to a single leftover pixel, then
// packs back to bytes with unsigned saturation.
// alpha blend: dst += (src - dst) * alpha / 256, where blend is lane 3 (the
// alpha lane) of each pixel broadcast to all four lanes
2205 case DPSOFTRAST_BLENDMODE_ALPHA:
2206 #define FINISHBLEND(blend2, blend1) \
2207 for (;x + 1 < subx;x += 2) \
2210 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2211 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2213 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2218 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2221 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2225 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2228 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// additive with alpha: dst += (src * alpha) >> 8
2232 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2234 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2237 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// additive: dst = src + dst (saturated when packed back to bytes)
2241 case DPSOFTRAST_BLENDMODE_ADD:
2242 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// inverse modulate: dst -= (dst * src) >> 8
2244 case DPSOFTRAST_BLENDMODE_INVMOD:
2246 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// modulate: dst = (src * dst) >> 8
2251 case DPSOFTRAST_BLENDMODE_MUL:
2252 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// modulate x2: dst = (src * dst) >> 7
2254 case DPSOFTRAST_BLENDMODE_MUL2:
2255 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subtractive with alpha: dst -= (src * alpha) >> 8
2257 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2259 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2262 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// pseudo alpha: dst = src + dst - ((dst * alpha) >> 8)
2266 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2268 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2271 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// inverse add: dst += (255 - dst) * src / 256
2275 case DPSOFTRAST_BLENDMODE_INVADD:
2277 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2287 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2288 // warning: this is SLOW, only use if the optimized per-span functions won't do
2290 const unsigned char * RESTRICT pixelbase;
2291 const unsigned char * RESTRICT pixel[4];
2292 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2293 int wrapmask[2] = { width-1, height-1 };
2294 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2295 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2297 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2298 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2299 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2300 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2301 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2302 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2303 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2305 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2306 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2307 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2308 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2312 tci[0] &= wrapmask[0];
2313 tci[1] &= wrapmask[1];
2314 tci1[0] &= wrapmask[0];
2315 tci1[1] &= wrapmask[1];
2317 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2318 pixel[1] = pixelbase + 4 * (tci[0] - tci[1]*width);
2319 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2320 pixel[3] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2321 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2322 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2323 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2324 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2328 int tci[2] = { x * width, y * height };
2329 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2331 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2332 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2336 tci[0] &= wrapmask[0];
2337 tci[1] &= wrapmask[1];
2339 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
// Samples the texture bound to unit texunitindex along one span and writes
// the result as four floats per pixel to out4f[x*4..x*4+3] for x in
// [span->startx, span->endx).
//   thread       - supplies the bound texture (thread->texbound[texunitindex])
//   triangle     - supplies the mip level (triangle->mip[texunitindex]) and,
//                  through DPSOFTRAST_CALCATTRIB4F, the coordinate attribute
//   span         - supplies the pixel range
//   out4f        - output floats, scaled from bytes by 1/255
//   arrayindex   - attribute array passed to DPSOFTRAST_CALCATTRIB4F
//   zf           - per-pixel factor multiplied into the interpolated
//                  coordinates (read only at subspan end points)
// Texel bytes 2,1,0,3 are stored to outputs 0,1,2,3, i.e. bytes 0 and 2 are
// swapped relative to the texture data.
// NOTE(review): the embedded line numbers skip, so this listing does not show
// every original line (braces and short statements are absent).
2348 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2351 int startx = span->startx;
2352 int endx = span->endx;
2357 float tc[2], endtc[2];
2359 unsigned int tci[2];
2360 unsigned int tci1[2];
2361 unsigned int tcimin[2];
2362 unsigned int tcimax[2];
2367 const unsigned char * RESTRICT pixelbase;
2368 const unsigned char * RESTRICT pixel[4];
2369 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2370 // if no texture is bound, just fill it with white
2373 for (x = startx;x < endx;x++)
2375 out4f[x*4+0] = 1.0f;
2376 out4f[x*4+1] = 1.0f;
2377 out4f[x*4+2] = 1.0f;
2378 out4f[x*4+3] = 1.0f;
2382 mip = triangle->mip[texunitindex];
// rows are addressed with a negative stride (tciwidth below), so the base is
// one row before the end of this mip's data
2383 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2384 // if this mipmap of the texture is 1 pixel, just fill it with that color
2385 if (texture->mipmap[mip][1] == 4)
// NOTE(review): this reads the first four bytes of texture->bytes rather than
// the selected mip's pixel at pixelbase, unlike the BGRA8 variant of this
// function - confirm whether the mip offset should be applied here
2387 c[0] = texture->bytes[2] * (1.0f/255.0f);
2388 c[1] = texture->bytes[1] * (1.0f/255.0f);
2389 c[2] = texture->bytes[0] * (1.0f/255.0f);
2390 c[3] = texture->bytes[3] * (1.0f/255.0f);
2391 for (x = startx;x < endx;x++)
2393 out4f[x*4+0] = c[0];
2394 out4f[x*4+1] = c[1];
2395 out4f[x*4+2] = c[2];
2396 out4f[x*4+3] = c[3];
2400 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2401 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2402 flags = texture->flags;
// coordinates are scaled from normalized units to texels of this mip
2403 tcscale[0] = texture->mipmap[mip][2];
2404 tcscale[1] = texture->mipmap[mip][3];
// negative row stride, in texels
2405 tciwidth = -texture->mipmap[mip][2];
// NOTE(review): tcimin is used by the clamping code below but its
// initialization is not among the lines visible in this listing
2408 tcimax[0] = texture->mipmap[mip][2]-1;
2409 tcimax[1] = texture->mipmap[mip][3]-1;
2410 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2411 tciwrapmask[1] = texture->mipmap[mip][3]-1;
2412 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2413 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// coordinates are evaluated with zf only at subspan end points (at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and stepped linearly in between
// using fixed point with 12 fraction bits
2419 for (x = startx;x < endx;)
2421 unsigned int subtc[2];
2422 unsigned int substep[2];
2423 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2424 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2425 if (nextsub >= endx)
// last subspan: end on the final pixel and rescale the step to its length
2427 nextsub = endsub = endx-1;
2428 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2432 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2433 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2439 substep[0] = (endtc[0] - tc[0]) * subscale;
2440 substep[1] = (endtc[1] - tc[1]) * subscale;
2441 subtc[0] = tc[0] * (1<<12);
2442 subtc[1] = tc[1] * (1<<12);
2445 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// bilinear blend of four texels, indices clamped to [tcimin, tcimax]
2447 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2449 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2450 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2451 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2452 tci[0] = subtc[0]>>12;
2453 tci[1] = subtc[1]>>12;
2454 tci1[0] = tci[0] + 1;
2455 tci1[1] = tci[1] + 1;
2456 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2457 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2458 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2459 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2460 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2461 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2462 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2463 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// the four weights sum to 0x1000*0x1000 = 1<<24 and a byte is at most 255,
// so dividing by 0xFF000000 (255 << 24) maps the weighted sum to 0..1
2464 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2465 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2466 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2467 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2468 out4f[x*4+0] = c[0];
2469 out4f[x*4+1] = c[1];
2470 out4f[x*4+2] = c[2];
2471 out4f[x*4+3] = c[3];
// bilinear blend of four texels, indices masked with tciwrapmask (size-1)
2476 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2478 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2479 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2480 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2481 tci[0] = subtc[0]>>12;
2482 tci[1] = subtc[1]>>12;
2483 tci1[0] = tci[0] + 1;
2484 tci1[1] = tci[1] + 1;
2485 tci[0] &= tciwrapmask[0];
2486 tci[1] &= tciwrapmask[1];
2487 tci1[0] &= tciwrapmask[0];
2488 tci1[1] &= tciwrapmask[1];
2489 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2490 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2491 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2492 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2493 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2494 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2495 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2496 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2497 out4f[x*4+0] = c[0];
2498 out4f[x*4+1] = c[1];
2499 out4f[x*4+2] = c[2];
2500 out4f[x*4+3] = c[3];
2504 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// single texel per pixel, indices clamped to [tcimin, tcimax]
2506 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2508 tci[0] = subtc[0]>>12;
2509 tci[1] = subtc[1]>>12;
2510 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2511 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2512 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2513 c[0] = pixel[0][2] * (1.0f / 255.0f);
2514 c[1] = pixel[0][1] * (1.0f / 255.0f);
2515 c[2] = pixel[0][0] * (1.0f / 255.0f);
2516 c[3] = pixel[0][3] * (1.0f / 255.0f);
2517 out4f[x*4+0] = c[0];
2518 out4f[x*4+1] = c[1];
2519 out4f[x*4+2] = c[2];
2520 out4f[x*4+3] = c[3];
// single texel per pixel, indices masked with tciwrapmask (size-1)
2525 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2527 tci[0] = subtc[0]>>12;
2528 tci[1] = subtc[1]>>12;
2529 tci[0] &= tciwrapmask[0];
2530 tci[1] &= tciwrapmask[1];
2531 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2532 c[0] = pixel[0][2] * (1.0f / 255.0f);
2533 c[1] = pixel[0][1] * (1.0f / 255.0f);
2534 c[2] = pixel[0][0] * (1.0f / 255.0f);
2535 c[3] = pixel[0][3] * (1.0f / 255.0f);
2536 out4f[x*4+0] = c[0];
2537 out4f[x*4+1] = c[1];
2538 out4f[x*4+2] = c[2];
2539 out4f[x*4+3] = c[3];
// SSE2 texture sampler for one span: reads the texture bound to unit
// texunitindex and writes one 4-byte pixel per x in [span->startx, span->endx)
// to out4ub, keeping the byte order of the texture data.
//   thread       - supplies the bound texture (thread->texbound[texunitindex])
//   triangle     - supplies the mip level (triangle->mip[texunitindex]) and,
//                  through DPSOFTRAST_CALCATTRIB, the coordinate attribute
//   span         - supplies the pixel range
//   out4ub       - output pixels, 4 bytes each
//   arrayindex   - attribute array passed to DPSOFTRAST_CALCATTRIB
//   zf           - per-pixel factor multiplied into the interpolated
//                  coordinates (read only at subspan end points)
// Coordinates are stepped in fixed point with 16 fraction bits; two pixels are
// processed per loop iteration with a single-pixel tail.
// NOTE(review): the embedded line numbers skip, so this listing does not show
// every original line (braces, short statements and preprocessor guards are absent).
2546 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2550 int startx = span->startx;
2551 int endx = span->endx;
2553 __m128 data, slope, tcscale;
2554 __m128i tcsize, tcmask, tcoffset, tcmax;
2556 __m128i subtc, substep, endsubtc;
2559 int affine; // LordHavoc: optimized affine texturing case
2560 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2561 const unsigned char * RESTRICT pixelbase;
2562 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2563 // if no texture is bound, just fill it with white
2566 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2569 mip = triangle->mip[texunitindex];
// rows are addressed with a negative stride (see tcoffset), so the base is one
// row before the end of this mip's data
2570 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2571 // if this mipmap of the texture is 1 pixel, just fill it with that color
2572 if (texture->mipmap[mip][1] == 4)
2574 unsigned int k = *((const unsigned int *)pixelbase);
2575 for (x = startx;x < endx;x++)
// equal zf at both ends of the span: the loop below then covers the whole
// span with a single linearly-stepped subspan
2579 affine = zf[startx] == zf[endx-1];
2580 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2581 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2582 flags = texture->flags;
// tcsize lanes = { width, height, width, height } of this mip
2583 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2584 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2585 tcscale = _mm_cvtepi32_ps(tcsize);
2586 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2587 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2588 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2590 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2591 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane holds the 16-bit pair (4, -4*width); _mm_madd_epi16 of an
// (x, y) index pair with it yields the byte offset 4*x - 4*width*y
2592 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
// size-1 per axis as 16-bit words, used to clamp or mask 16-bit indices
2593 tcmax = _mm_packs_epi32(tcmask, tcmask);
2594 for (x = startx;x < endx;)
2596 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2597 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2598 if (nextsub >= endx || affine)
// last (or only) subspan: end on the final pixel and rescale the step
2600 nextsub = endsub = endx-1;
2601 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2605 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2607 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2608 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2609 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinates of pixel x, high half = pixel x+1; the step is then
// doubled because each iteration advances two pixels
2610 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2611 substep = _mm_slli_epi32(substep, 1);
// fast path: if the integer coordinates at both ends of the subspan are
// >= 0 and < size-1, no index needs clamping or masking
2614 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2615 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// byte distance to the adjacent row (-4*width)
2617 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2618 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2620 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2621 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2622 tci = _mm_madd_epi16(tci, tcoffset);
2623 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2624 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 8-byte load fetches two horizontally adjacent texels
2625 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2626 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2627 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2628 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2629 fracm = _mm_srli_epi16(subtc, 1);
2630 pix1 = _mm_add_epi16(pix1,
2631 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2632 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2633 pix3 = _mm_add_epi16(pix3,
2634 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2635 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2636 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2637 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2638 pix2 = _mm_add_epi16(pix2,
2639 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2640 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2641 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail of the fast path
2645 const unsigned char * RESTRICT ptr1;
2646 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2647 tci = _mm_madd_epi16(tci, tcoffset);
2648 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2649 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2650 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2651 fracm = _mm_srli_epi16(subtc, 1);
2652 pix1 = _mm_add_epi16(pix1,
2653 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2654 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2655 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2656 pix1 = _mm_add_epi16(pix1,
2657 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2658 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2659 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2663 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// four-texel blend with each of the four (x, y) index pairs clamped to
// [0, size-1] before the offset computation
2665 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2667 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2668 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2669 tci = _mm_madd_epi16(tci, tcoffset);
2670 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2671 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2672 _mm_setzero_si128());
2673 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2674 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2675 _mm_setzero_si128());
2676 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the second pixel's indices are masked with _mm_and_si128 here,
// as in the non-clamping loop, while the first pixel above and the
// single-pixel tail below clamp with _mm_min_epi16/_mm_max_epi16 - this looks
// like a copy-paste slip in the clamp-to-edge path; confirm before changing
2677 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2678 tci = _mm_madd_epi16(tci, tcoffset);
2679 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2680 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2681 _mm_setzero_si128());
2682 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2683 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2684 _mm_setzero_si128());
2685 fracm = _mm_srli_epi16(subtc, 1);
2686 pix1 = _mm_add_epi16(pix1,
2687 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2688 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2689 pix3 = _mm_add_epi16(pix3,
2690 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2691 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2692 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2693 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2694 pix2 = _mm_add_epi16(pix2,
2695 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2696 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2697 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail, clamped
2701 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2702 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2703 tci = _mm_madd_epi16(tci, tcoffset);
2704 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2705 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2706 _mm_setzero_si128());
2707 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2708 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2709 _mm_setzero_si128());
2710 fracm = _mm_srli_epi16(subtc, 1);
2711 pix1 = _mm_add_epi16(pix1,
2712 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2713 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2714 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2715 pix1 = _mm_add_epi16(pix1,
2716 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2717 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2718 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// four-texel blend with indices masked with tcmax (size-1 per axis)
2724 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2726 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2727 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2728 tci = _mm_madd_epi16(tci, tcoffset);
2729 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2730 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2731 _mm_setzero_si128());
2732 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2733 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2734 _mm_setzero_si128());
2735 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2736 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2737 tci = _mm_madd_epi16(tci, tcoffset);
2738 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2739 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2740 _mm_setzero_si128());
2741 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2742 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2743 _mm_setzero_si128());
2744 fracm = _mm_srli_epi16(subtc, 1);
2745 pix1 = _mm_add_epi16(pix1,
2746 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2747 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2748 pix3 = _mm_add_epi16(pix3,
2749 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2750 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2751 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2752 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2753 pix2 = _mm_add_epi16(pix2,
2754 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2755 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2756 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail, masked
2760 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2761 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2762 tci = _mm_madd_epi16(tci, tcoffset);
2763 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2764 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2765 _mm_setzero_si128());
2766 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2767 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2768 _mm_setzero_si128());
2769 fracm = _mm_srli_epi16(subtc, 1);
2770 pix1 = _mm_add_epi16(pix1,
2771 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2772 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2773 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2774 pix1 = _mm_add_epi16(pix1,
2775 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2776 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2777 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2784 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// one texel per pixel, indices clamped to [0, size-1]
2786 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2788 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2789 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2790 tci = _mm_madd_epi16(tci, tcoffset);
2791 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2792 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2796 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2797 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2798 tci = _mm_madd_epi16(tci, tcoffset);
2799 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// one texel per pixel, indices masked with tcmax (size-1 per axis)
2805 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2807 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2808 tci = _mm_and_si128(tci, tcmax);
2809 tci = _mm_madd_epi16(tci, tcoffset);
2810 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2811 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2815 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2816 tci = _mm_and_si128(tci, tcmax);
2817 tci = _mm_madd_epi16(tci, tcoffset);
2818 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2827 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2830 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Takes a pointer to float data (vector) and returns a float; presumably a
// shadow-map lookup, judging by the name only.
// NOTE(review): only the signature is visible in this listing (the embedded
// line numbers skip over the body), so its behavior cannot be documented here.
2833 static float DPSOFTRAST_SampleShadowmap(const float *vector)
2840 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2843 int startx = span->startx;
2844 int endx = span->endx;
2849 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2850 for (x = startx;x < endx;x++)
2853 c[0] = (data[0] + slope[0]*x) * z;
2854 c[1] = (data[1] + slope[1]*x) * z;
2855 c[2] = (data[2] + slope[2]*x) * z;
2856 c[3] = (data[3] + slope[3]*x) * z;
2857 out4f[x*4+0] = in4f[x*4+0] * c[0];
2858 out4f[x*4+1] = in4f[x*4+1] * c[1];
2859 out4f[x*4+2] = in4f[x*4+2] * c[2];
2860 out4f[x*4+3] = in4f[x*4+3] * c[3];
2866 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2869 int startx = span->startx;
2870 int endx = span->endx;
2875 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2876 for (x = startx;x < endx;x++)
2879 c[0] = (data[0] + slope[0]*x) * z;
2880 c[1] = (data[1] + slope[1]*x) * z;
2881 c[2] = (data[2] + slope[2]*x) * z;
2882 c[3] = (data[3] + slope[3]*x) * z;
2883 out4f[x*4+0] = c[0];
2884 out4f[x*4+1] = c[1];
2885 out4f[x*4+2] = c[2];
2886 out4f[x*4+3] = c[3];
// Float span op: out4f = ina4f + max(inb4f - subcolor, 0), component-wise, for
// pixels startx..endx-1. `triangle` is not used by the visible code.
2892 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2894 int x, startx = span->startx, endx = span->endx;
2895 float c[4], localcolor[4];
// copy the subtract color into a local array before the loop
2896 localcolor[0] = subcolor[0];
2897 localcolor[1] = subcolor[1];
2898 localcolor[2] = subcolor[2];
2899 localcolor[3] = subcolor[3];
2900 for (x = startx;x < endx;x++)
// subtract the threshold color and clamp negative results to zero
2902 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2903 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2904 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2905 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
// add the clamped remainder onto the first input
2906 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2907 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2908 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2909 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Float span op: out4f = ina4f * inb4f, component-wise, for pixels
// startx..endx-1 (4 floats per pixel). `triangle` is not used by the visible code.
2915 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2917 int x, startx = span->startx, endx = span->endx;
2918 for (x = startx;x < endx;x++)
2920 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2921 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2922 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2923 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Float span op: out4f = ina4f + inb4f, component-wise, for pixels
// startx..endx-1 (4 floats per pixel). No clamping is applied here.
2929 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2931 int x, startx = span->startx, endx = span->endx;
2932 for (x = startx;x < endx;x++)
2934 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2935 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2936 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2937 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span op: blends the two inputs per pixel as out4f = ina4f*a + inb4f*b,
// where a = 1 - (alpha of inb4f), for pixels startx..endx-1.
2943 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2945 int x, startx = span->startx, endx = span->endx;
2947 for (x = startx;x < endx;x++)
// weight of the first input is the complement of the second input's alpha
2949 a = 1.0f - inb4f[x*4+3];
// NOTE(review): b is assigned on a line elided from this excerpt; presumably b = inb4f[x*4+3] - confirm.
2951 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2952 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2953 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2954 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Float span op: blends every pixel toward one constant color, using that
// color's own alpha as the blend factor:
//   out4f = in4f * (1 - color[3]) + color * color[3]
// All four components, including alpha, go through the same formula.
2960 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2962 int x, startx = span->startx, endx = span->endx;
2963 float localcolor[4], ilerp, lerp;
2964 localcolor[0] = color[0];
2965 localcolor[1] = color[1];
2966 localcolor[2] = color[2];
2967 localcolor[3] = color[3];
// blend weights are constant across the span
2968 ilerp = 1.0f - localcolor[3];
2969 lerp = localcolor[3];
2970 for (x = startx;x < endx;x++)
2972 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2973 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2974 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2975 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// BGRA8 span op: out4ub = in4ub * (interpolated vertex attribute `arrayindex`).
// The attribute is evaluated exactly only at sub-span endpoints and stepped
// linearly in 8.8 fixed point (scale 256) in between.
2982 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2986 int startx = span->startx;
2987 int endx = span->endx;
2990 __m128i submod, substep, endsubmod;
2991 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (presumably RGBA attribute order -> BGRA pixel order)
2992 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2993 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact attribute value at the first pixel, and its fixed-point form
2994 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2995 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2996 for (x = startx; x < endx;)
// each pass handles one sub-span of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2998 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2999 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3000 if (nextsub >= endx)
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
3002 nextsub = endsub = endx-1;
3003 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// NOTE(review): mod and submod are set on lines elided from this excerpt;
// presumably they take the previous endmod / endsubmod - confirm.
3007 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3008 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3009 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16 bit: low half holds the value for pixel x, high half for pixel x+1
3010 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3011 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): this loop consumes two pixels per iteration but advances submod
// by a single per-pixel substep - confirm whether the step should be doubled.
3012 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// two pixels: bytes widened to (value << 8), so mulhi yields value * submod / 256
3014 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3015 pix = _mm_mulhi_epu16(pix, submod);
3016 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is elided)
3020 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3021 pix = _mm_mulhi_epu16(pix, submod);
3022 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 span op: writes the interpolated vertex attribute `arrayindex` as the
// pixel color. The attribute is evaluated exactly only at sub-span endpoints and
// stepped linearly in fixed point (scale 4095) in between; `>> 4` then brings
// it to the 0..255 range and the pack saturates to bytes.
3029 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3033 int startx = span->startx;
3034 int endx = span->endx;
3037 __m128i submod, substep, endsubmod;
3038 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (presumably RGBA attribute order -> BGRA pixel order)
3039 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3040 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact attribute value at the first pixel, and its fixed-point form
3041 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3042 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3043 for (x = startx; x < endx;)
// each pass handles one sub-span of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
3045 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3046 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3047 if (nextsub >= endx)
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
3049 nextsub = endsub = endx-1;
3050 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// NOTE(review): mod and submod are set on lines elided from this excerpt;
// presumably they take the previous endmod / endsubmod - confirm.
3054 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3055 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3056 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16 bit: low half holds the value for pixel x, high half for pixel x+1
3057 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3058 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): this loop consumes two pixels per iteration but advances submod
// by a single per-pixel substep - confirm whether the step should be doubled.
3059 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// two pixels: drop the 4 fractional bits, saturate to bytes, store 8 bytes
3061 __m128i pix = _mm_srai_epi16(submod, 4);
3062 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is elided)
3066 __m128i pix = _mm_srai_epi16(submod, 4);
3067 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 span op: out4ub = saturate(ina4ub + max(inb4ub - subcolor*255, 0)),
// processed two pixels at a time with a single-pixel tail.
3074 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3077 int x, startx = span->startx, endx = span->endx;
// scale the float color to 0..255, swap lanes 0 and 2 (presumably RGBA -> BGRA),
// then pack to 16 bit so both halves of the register hold the same color
3078 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3079 localcolor = _mm_packs_epi32(localcolor, localcolor);
3080 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input to 16 bit per channel
3082 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3083 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// the unsigned saturating subtract clamps at zero; the final pack clamps at 255
3084 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3085 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail (its guarding condition is on a line elided from this excerpt)
3089 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3090 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3091 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3092 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span op: out4ub = ina4ub * inb4ub / 256 per channel, processed two
// pixels at a time with a single-pixel tail.
3097 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3100 int x, startx = span->startx, endx = span->endx;
3101 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds the plain byte values; pix2 holds (byte << 8), so the high half of
// the 16x16 product below equals ina * inb / 256
3103 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3104 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3105 pix1 = _mm_mulhi_epu16(pix1, pix2);
3106 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail (its guarding condition is on a line elided from this excerpt)
3110 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3111 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3112 pix1 = _mm_mulhi_epu16(pix1, pix2);
3113 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span op: out4ub = saturate(ina4ub + inb4ub) per channel, processed two
// pixels at a time with a single-pixel tail.
3118 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3121 int x, startx = span->startx, endx = span->endx;
3122 for (x = startx;x+2 <= endx;x+=2)
// widen to 16 bit, add, then let the unsigned-saturating pack clamp sums above 255
3124 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3125 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3126 pix1 = _mm_add_epi16(pix1, pix2);
3127 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail (its guarding condition is on a line elided from this excerpt)
3131 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3132 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3133 pix1 = _mm_add_epi16(pix1, pix2);
3134 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span op: out4ub = saturate(ina4ub + inb4ub * tint) per channel. The
// tint is loaded as-is (no lane swap), so it must already be in BGRA order as
// the parameter name indicates.
3140 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3143 int x, startx = span->startx, endx = span->endx;
// tint in 8.8 fixed point (scale 256), packed so both register halves hold it
3144 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3145 tint = _mm_packs_epi32(tint, tint);
3146 for (x = startx;x+2 <= endx;x+=2)
// pix2 is (byte << 8); the high half of tint * pix2 is the tinted byte value
3148 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3149 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3150 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3151 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail (its guarding condition is on a line elided from this excerpt)
3155 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3156 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3157 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3158 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span op: blends inb over ina using inb's own alpha byte:
//   out4ub = ina + (inb - ina) * alpha(inb) / 256
// processed two pixels at a time with a single-pixel tail.
3164 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3167 int x, startx = span->startx, endx = span->endx;
3168 for (x = startx;x+2 <= endx;x+=2)
3170 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3171 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's 4th channel (alpha) across that pixel's four lanes
3172 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply returns diff * alpha / 256
3173 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3174 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel tail (its guarding condition is on a line elided from this excerpt)
3178 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3179 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3180 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3181 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3182 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span op: blends every pixel toward one constant color, using that
// color's alpha as the blend factor:
//   out4ub = in + (color255 - in) * alpha255 / 256
// processed two pixels at a time with a single-pixel tail.
3187 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3190 int x, startx = span->startx, endx = span->endx;
// scale the float color to 0..255 and swap lanes 0 and 2 (presumably RGBA -> BGRA)
3191 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3192 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast the color's alpha lane and pre-shift it by 4 for the mulhi below
3193 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3194 for (x = startx;x+2 <= endx;x+=2)
3196 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (diff << 4) * (alpha << 4) >> 16 == diff * alpha / 256
3197 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3198 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel tail (its guarding condition is on a line elided from this excerpt)
3202 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3203 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3204 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: runs the position array through
// TransformProject with the ModelViewProjectionMatrixM1 uniform, then loads the
// color and first texcoord arrays. The second texcoord array is loaded only
// when the SPECULAR permutation bit is set.
3211 static void DPSOFTRAST_VertexShader_Generic(void)
3213 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3214 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3215 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3216 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3217 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader for one span. With the DIFFUSE permutation
// the first texture is sampled and multiplied by the interpolated attribute in
// array 1; with SPECULAR a second texture is sampled and combined by multiply
// (COLORMAPPING), add (GLOW) or alpha blend (VERTEXTEXTUREBLEND). The result
// is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): arrays 2 and 1 are presumably TEXCOORD0 and COLOR - confirm
// against the DPSOFTRAST_ARRAY_* enum.
3220 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3222 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3223 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3224 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3225 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3226 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3227 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// FragColor = first texture * interpolated attribute
3229 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3230 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3231 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// the second texture reuses texcoord array 2
3233 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3234 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3237 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// This branch used to repeat the COLORMAPPING test above, so the additive
// combine could never run. It is now selected by the GLOW permutation.
// NOTE(review): GLOW is chosen as the counterpart of the additive combine -
// confirm against the hardware generic shader.
3239 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3242 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3244 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3247 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured path: FragColor = interpolated attribute only (the `else` that
// pairs this with the DIFFUSE test is on a line elided from this excerpt)
3252 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3253 if(thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
// scale each pixel's alpha byte by the Alpha uniform; the float product is
// converted back to unsigned char on store
3256 for (x = span->startx;x < span->endx;x++)
3257 buffer_FragColorbgra8[x*4+3] = buffer_FragColorbgra8[x*4+3] * thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3259 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: runs the position array through
// TransformProject with the ModelViewProjectionMatrixM1 uniform and loads two
// texcoord arrays. Note that the second load names TEXCOORD1 and TEXCOORD4 as
// its two (different) array arguments.
3264 static void DPSOFTRAST_VertexShader_PostProcess(void)
3266 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3267 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3268 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader for one span: samples the first
// texture into FragColor, optionally adds thresholded bloom from the second
// texture, mixes in the ViewTintColor uniform, then finishes the span.
// The SATURATION and GAMMARAMPS permutations are recognised but do nothing yet.
3271 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3273 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3274 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3275 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3276 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3277 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3278 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3279 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// bloom texture is sampled with attribute array 3; BloomColorSubtract is the threshold
3281 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3282 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// NOTE(review): the tint mix appears to run regardless of BLOOM (the bloom
// block's closing brace is on an elided line) - confirm.
3284 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3285 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3287 // TODO: implement saturation
3289 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3291 // TODO: implement gammaramps
3293 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is processed
// (TransformProject with the ModelViewProjectionMatrixM1 uniform).
3298 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3300 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: writes all-zero BGRA for the span's
// pixels (startx..endx-1) and finishes the span.
3303 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3305 // this is never called (because colormask is off when this shader is used)
3306 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3307 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3308 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear exactly the covered pixels: (endx - startx) pixels of 4 bytes each
3309 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3310 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: TransformProject on the position array
// with the ModelViewProjectionMatrixM1 uniform, and Transform on the first
// texcoord array with the TexMatrixM1 uniform.
3315 static void DPSOFTRAST_VertexShader_FlatColor(void)
3317 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3318 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span: output = color texture *
// Color_Ambient (rgb) and * Alpha (a). Pixels are written straight into the
// first color buffer unless ALPHAKILL or a non-opaque blend mode is active, in
// which case they go to a temporary buffer and through FinishBGRA8.
3321 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3324 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's row in the first color buffer
3325 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3326 int x, startx = span->startx, endx = span->endx;
3327 __m128i Color_Ambientm;
3328 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3329 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3330 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3331 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3332 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3333 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3334 pixel = buffer_FragColorbgra8;
// ambient color in 8.8 fixed point with lanes 0 and 2 swapped (presumably RGBA -> BGRA)
3335 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// replace the alpha lane with the Alpha uniform scaled by 255
3336 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3337 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3338 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3339 for (x = startx;x < endx;x++)
// fast path: four pixels at once when all four pixelmask bytes equal 1
// NOTE(review): how x advances past the other three pixels, and how masked-out
// pixels are skipped, is on lines elided from this excerpt.
3342 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3345 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes are widened to (byte << 8) so mulhi yields color * texel
3346 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3347 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3348 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3354 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3355 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3356 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer route needs the finish step
3358 if (pixel == buffer_FragColorbgra8)
3359 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: TransformProject on the position
// array with the ModelViewProjectionMatrixM1 uniform, a plain load of the color
// array, and Transform on the first texcoord array with the TexMatrixM1 uniform.
3365 static void DPSOFTRAST_VertexShader_VertexColor(void)
3367 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3368 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3369 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span:
//   output = color texture * (Color_Diffuse * interpolated vertex color + Color_Ambient)
// Pixels are written straight into the first color buffer unless ALPHAKILL or
// a non-opaque blend mode is active, in which case they go to a temporary
// buffer and through FinishBGRA8.
3372 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3375 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's row in the first color buffer
3376 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3377 int x, startx = span->startx, endx = span->endx;
3378 __m128i Color_Ambientm, Color_Diffusem;
3380 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3381 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3382 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3383 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3384 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3385 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3386 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3387 pixel = buffer_FragColorbgra8;
// ambient color in 8.8 fixed point (lanes 0/2 swapped); alpha lane replaced by Alpha * 255
3388 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3389 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3390 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3391 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096 with its alpha lane zeroed, so alpha comes only from the ambient term
3392 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3393 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3394 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3395 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2, advance the attribute to startx, and pre-scale value and step by 4096
3396 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3397 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3398 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3399 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3400 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped once per loop iteration here, plus three more times inside the 4-wide path
3401 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3403 __m128i color, mod, pix;
// fast path: four pixels at once when all four pixelmask bytes equal 1
// NOTE(review): how x advances past the other three pixels, and how masked-out
// pixels are skipped, is on lines elided from this excerpt.
3404 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3407 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3408 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// vertex color for pixels x, x+1 (mod) and x+2, x+3 (mod2), each multiplied by its own z
3409 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3410 data = _mm_add_ps(data, slope);
3411 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3412 data = _mm_add_ps(data, slope);
3413 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3414 data = _mm_add_ps(data, slope);
3415 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3416 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3417 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3418 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3419 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3420 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3426 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3427 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3428 mod = _mm_packs_epi32(mod, mod);
3429 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3430 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer route needs the finish step
3432 if (pixel == buffer_FragColorbgra8)
3433 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: TransformProject on the position array
// with the ModelViewProjectionMatrixM1 uniform, Transform on the first texcoord
// array with the TexMatrixM1 uniform, and a plain load of the TEXCOORD4 array
// (which the pixel stage uses for the lightmap lookup).
3439 static void DPSOFTRAST_VertexShader_Lightmap(void)
3441 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3442 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3443 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
//   output = color texture * (Color_Diffuse * lightmap + Color_Ambient)
//            [+ Color_Glow * glow texture, with the GLOW permutation]
// Pixels are written straight into the first color buffer unless ALPHAKILL or
// a non-opaque blend mode is active, in which case they go to a temporary
// buffer and through FinishBGRA8.
3446 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3449 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's row in the first color buffer
3450 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3451 int x, startx = span->startx, endx = span->endx;
3452 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3453 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3454 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3455 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3456 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3457 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3458 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, lightmap texture uses TEXCOORD4
3459 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3460 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3461 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3462 pixel = buffer_FragColorbgra8;
// ambient color in 8.8 fixed point (lanes 0/2 swapped); alpha lane replaced by Alpha * 255
3463 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3464 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3465 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3466 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color in 8.8 fixed point with its alpha lane zeroed
3467 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3468 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3469 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3470 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and prepare the glow color the same way
3472 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3473 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3474 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3475 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow color; lets the single-pixel path do both products in one multiply
3476 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3477 for (x = startx;x < endx;x++)
3479 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when all four pixelmask bytes equal 1
// NOTE(review): how x advances past the other three pixels, and how masked-out
// pixels are skipped, is on lines elided from this excerpt.
3480 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3483 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3484 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3485 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3486 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3487 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3488 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3489 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3490 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3491 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3492 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the low half computes (diffuse*lightmap + ambient) * color,
// the high half computes glowcolor * glow (its lightmap lanes are zero); the
// two halves are then summed
3498 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3499 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3500 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3501 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3502 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3503 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow (presumably the `else` of the GLOW test; that line is
// elided from this excerpt)
3508 for (x = startx;x < endx;x++)
3510 __m128i color, lightmap, pix;
3511 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3514 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3515 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3516 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3517 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3518 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3519 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3520 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3526 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3527 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3528 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3529 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer route needs the finish step
3532 if (pixel == buffer_FragColorbgra8)
3533 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shader entry points
// that follow call these two functions before their definitions appear.
3538 void DPSOFTRAST_VertexShader_LightDirection(void);
3539 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex shader entry point: it does no work of its own and simply
// forwards to DPSOFTRAST_VertexShader_LightDirection().
3541 static void DPSOFTRAST_VertexShader_FakeLight(void)
3543 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel shader entry point: passes thread/triangle/span unchanged to
// DPSOFTRAST_PixelShader_LightDirection(), which selects its fake-light path by
// testing thread->shader_mode against SHADERMODE_FAKELIGHT.
3546 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3548 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the model-space light-direction-map mode: runs the common
// LightDirection vertex stage, then additionally loads TEXCOORD4.
// The LightDirection pixel shader uses TEXCOORD4 as the coordinate array for its
// lightmap and deluxemap texture lookups.
3553 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3555 DPSOFTRAST_VertexShader_LightDirection();
3556 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the model-space light-direction-map mode: forwards to
// DPSOFTRAST_PixelShader_LightDirection(), which handles this mode in its
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE branches.
3559 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3561 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the tangent-space light-direction-map mode: same sequence as
// the model-space variant (common LightDirection vertex stage, then load
// TEXCOORD4 for the lightmap/deluxemap lookups done in the pixel shader).
3566 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3568 DPSOFTRAST_VertexShader_LightDirection();
3569 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the tangent-space light-direction-map mode: forwards to
// DPSOFTRAST_PixelShader_LightDirection(), which handles this mode in its
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE branches.
3572 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3574 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Common vertex stage used directly and by the FakeLight / LightDirectionMap
// vertex shaders. For every vertex it writes two extra arrays:
//   TEXCOORD5 = the LightDir uniform expressed in the per-vertex
//               (svector, tvector, normal) basis read from TEXCOORD1/2/3
//   TEXCOORD6 = (EyePosition - position) expressed in that same basis
// and finally hands POSITION to DPSOFTRAST_Array_TransformProject together with
// the ModelViewProjectionMatrixM1 uniform.
// Takes no parameters and returns nothing; all inputs and outputs live in the
// global dpsoftrast state.
3579 void DPSOFTRAST_VertexShader_LightDirection(void)
3582 int numvertices = dpsoftrast.numvertices;
3584 float LightVector[4];
3585 float EyePosition[4];
3586 float EyeVectorModelSpace[4];
// Copy the two 4-component uniforms into locals. Only components 0..2 are used
// by the visible code below; component 3 is read but not referenced again here.
3592 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3593 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3594 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3595 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3596 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3597 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3598 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3599 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays that the per-vertex loop reads from dpsoftrast.post_array4f.
// TEXCOORD0 is the only one passed through a matrix (TexMatrixM1); the others are
// loaded with identical source/destination ids.
// NOTE(review): presumably Array_Load copies the input array into post_array4f
// unchanged - confirm against DPSOFTRAST_Array_Load.
3600 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3601 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3602 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3603 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3604 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3605 for (i = 0;i < numvertices;i++)
// Fetch xyz of the position and of the three basis vectors
// (TEXCOORD1 -> svector, TEXCOORD2 -> tvector, TEXCOORD3 -> normal).
3607 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3608 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3609 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3610 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3611 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3612 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3613 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3614 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3615 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3616 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3617 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3618 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector: dot LightDir with each basis vector, store as TEXCOORD5 (w = 0).
3619 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3620 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3621 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3622 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3623 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3624 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3625 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Eye vector: vertex-to-eye offset, projected onto the same basis, stored as
// TEXCOORD6 (w = 0). It is not normalized here; the pixel shader normalizes it.
3626 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3627 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3628 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3629 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3630 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3631 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3632 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3633 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3634 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3635 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Done last, after the loop has consumed the untransformed positions.
// NOTE(review): the -1 second argument presumably selects in-place operation on
// the already loaded POSITION data - confirm in DPSOFTRAST_Array_TransformProject.
3637 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small helper macros for the float shader paths.
// Min/Max evaluate the selected argument twice, so arguments must be free of
// side effects.
3640 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3641 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product over components 0..2 of two indexable operands.
3642 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3643 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Uses sqrt(), so the result has type double even for float operands.
3644 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Statement macro that starts by computing the length of v.
// NOTE(review): the rest of the macro body is not visible in this excerpt;
// presumably it scales v by 1/len when len is non-zero - confirm against the
// full definition. v is passed to Vector3Dot unparenthesized here, so callers
// should pass a plain array name.
3645 #define DPSOFTRAST_Vector3Normalize(v)\
3648 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Float (non-SSE) pixel shader for the LightDirection family of shader modes
// (also reached through the FakeLight and LightDirectionMap wrappers).
// Shades the pixels x in [span->startx, span->endx) into buffer_FragColorbgra8
// and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread   - supplies the uniforms (thread->uniform4f), shader_permutation,
//            shader_mode and shader_exactspecularmath that drive the branches
// triangle - forwarded to the span/texture/attribute helpers
// span     - supplies startx/endx and is forwarded to the helpers
//
// Three shading paths are selected by thread->shader_permutation:
//   SPECULAR : ambient + (diffuse + specular) * light color, optional glow
//   DIFFUSE  : ambient + diffuse * light color, optional glow
//   fallback : ambient only, optional glow
// Inside the SPECULAR and DIFFUSE paths thread->shader_mode picks where the light
// direction and light color come from (deluxemap/lightmap, eye vector, or the
// interpolated TEXCOORD5 light vector plus the LightColor uniform).
//
// All colour uniforms are stored with components 0 and 2 swapped relative to the
// uniform layout, so index n of a Color_* array lines up with byte n of the
// *bgra8 buffers.
3659 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one z value and one 4-byte texel per pixel.
3661 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3662 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3669 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3670 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671 int x, startx = span->startx, endx = span->endx;
3672 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each *data/*slope pair is filled by DPSOFTRAST_CALCATTRIB4F and evaluated
// below as data[c] + slope[c]*x.
3673 float LightVectordata[4];
3674 float LightVectorslope[4];
3675 float EyeVectordata[4];
3676 float EyeVectorslope[4];
3677 float VectorSdata[4];
3678 float VectorSslope[4];
3679 float VectorTdata[4];
3680 float VectorTslope[4];
3681 float VectorRdata[4];
3682 float VectorRslope[4];
3684 float diffusetex[4];
3686 float surfacenormal[4];
3687 float lightnormal[4];
3688 float lightnormal_modelspace[4];
3690 float specularnormal[4];
3693 float SpecularPower;
// Uniform setup shared by all three paths. Glow, pants and shirt get a zero
// fourth component so they never contribute to the output alpha; the fourth
// component of Color_Ambient is taken from the Alpha uniform instead.
3695 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3696 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3697 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3698 Color_Glow[3] = 0.0f;
3699 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3700 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3701 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3702 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3703 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3704 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3705 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3706 Color_Pants[3] = 0.0f;
3707 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3708 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3709 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3710 Color_Shirt[3] = 0.0f;
// Fill buffer_z for the span, then sample the base colour texture. Optional
// layers (pants/shirt, glow) are sampled only when their permutation bit is set,
// so their buffers stay uninitialized otherwise and must only be read under the
// same permutation test.
3711 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3712 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3713 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3715 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3716 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3718 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3720 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular permutation -------------------------------------------
3722 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3724 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3725 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3726 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3727 Color_Diffuse[3] = 0.0f;
// LightColor starts as the uniform; the two lightmap modes overwrite components
// 0..2 per pixel and the fake-light mode overwrites them with 1.0.
3728 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3729 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3730 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3731 LightColor[3] = 0.0f;
3732 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3733 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3734 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3735 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3736 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by glosstex[3], which holds
// a raw texture byte.
3737 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed by every shader mode in this path.
3738 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3739 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs: model-space deluxemaps need the S/T/R vectors
// (TEXCOORD1/2/3) plus lightmap and deluxemap texels sampled with TEXCOORD4.
3741 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3743 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3744 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3745 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3746 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3747 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Tangent-space deluxemaps only need the two texture lookups.
3749 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3751 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3752 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Fake light reuses the eye vector set up above, so nothing extra is fetched.
3754 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3756 // nothing of this needed
// Remaining mode (presumably the final else branch): interpolate the light
// vector the vertex shader wrote to TEXCOORD5.
3760 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the specular path.
// NOTE(review): z is used below but its assignment is not visible in this
// excerpt - presumably z = buffer_z[x] at the top of the loop body; confirm.
3763 for (x = startx;x < endx;x++)
3766 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3767 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3768 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3769 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping: add the tinted pants and shirt layers onto the base texel.
// The [3] line adds zero because Color_Pants[3] and Color_Shirt[3] are 0.
3770 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3772 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3773 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3774 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3775 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3777 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3778 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3779 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3780 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: byte * (1/128) - 1 maps 0..255 to about -1..+1.
// Bytes 2,1,0 of the texel are read into components 0,1,2 (byte order reversed).
3781 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3782 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3783 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3784 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Select light direction (lightnormal) and LightColor for this pixel.
3786 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3788 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3789 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3790 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3791 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3793 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3794 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3795 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3796 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3798 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3799 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3800 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3801 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3803 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3804 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3805 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3806 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3808 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3809 DPSOFTRAST_Vector3Normalize(lightnormal);
3811 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap byte scaled by 1/256 and divided by lightnormal[2], with that divisor
// floored at 0.25 so the boost is capped at 4x.
3813 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3814 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3815 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3816 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded texel is used as lightnormal directly
// (no renormalization) and the lightmap is scaled by a plain 1/256.
3819 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3821 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3822 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3823 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3825 float f = 1.0f / 256.0f;
3826 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3827 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3828 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: the light direction is the normalized eye vector and the light
// colour is white.
3831 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3833 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3834 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3835 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3836 DPSOFTRAST_Vector3Normalize(lightnormal);
3838 LightColor[0] = 1.0;
3839 LightColor[1] = 1.0;
3840 LightColor[2] = 1.0;
// Remaining mode (presumably the final else branch): normalized interpolated
// TEXCOORD5 light vector; LightColor keeps the uniform value loaded earlier.
3844 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3845 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3846 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3847 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L clamped at zero.
3850 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term, two formulations selected by thread->shader_exactspecularmath.
3852 if(thread->shader_exactspecularmath)
3854 // reflect lightnormal at surfacenormal, take the negative of that
3855 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3857 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3858 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3859 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3860 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3862 // dot of this and normalize(EyeVectorFogDepth.xyz)
3863 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3864 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3865 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3866 DPSOFTRAST_Vector3Normalize(eyenormal);
3868 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Non-exact variant (presumably the else branch): build the normalized sum of
// the light and eye directions and dot it with the surface normal.
3872 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3873 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3874 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3875 DPSOFTRAST_Vector3Normalize(eyenormal);
3877 specularnormal[0] = lightnormal[0] + eyenormal[0];
3878 specularnormal[1] = lightnormal[1] + eyenormal[1];
3879 specularnormal[2] = lightnormal[2] + eyenormal[2];
3880 DPSOFTRAST_Vector3Normalize(specularnormal);
3882 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is 1 + SpecularPower * gloss texel byte 3 (SpecularPower already
// carries the 1/255 factor).
3884 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Combine the terms. Results are clamped only at the upper bound (255).
// NOTE(review): no lower clamp is visible, so a negative sum would wrap when
// stored into the unsigned char output - confirm inputs are never negative.
3886 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3888 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3889 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3890 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3891 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Same formula without the glow term (presumably the else branch).
3895 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3896 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3897 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3898 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3901 buffer_FragColorbgra8[x*4+0] = d[0];
3902 buffer_FragColorbgra8[x*4+1] = d[1];
3903 buffer_FragColorbgra8[x*4+2] = d[2];
3904 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse-only permutation ----------------------------------------
// Same light selection as path 1, but no gloss texture and no specular term.
// NOTE(review): the pants/shirt buffers sampled above are not read anywhere in
// this path or in the fallback path, so colormapping only takes effect in the
// specular path - confirm that is intended.
3907 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3909 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3910 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3911 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3912 Color_Diffuse[3] = 0.0f;
3913 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3914 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3915 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3916 LightColor[3] = 0.0f;
3917 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3919 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3921 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3922 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3923 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3924 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3925 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3927 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3929 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3930 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Unlike path 1, the eye vector is fetched here only for the fake-light mode,
// the one mode in this path that reads it.
3932 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3934 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3938 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the diffuse path (z presumably reloaded from buffer_z[x] in
// a line not visible here - confirm).
3941 for (x = startx;x < endx;x++)
3944 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3945 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3946 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3947 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3948 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3949 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3950 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3951 DPSOFTRAST_Vector3Normalize(surfacenormal);
3953 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3955 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3956 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3957 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3958 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3960 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3961 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3962 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3963 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3965 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3966 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3967 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3968 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3970 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3971 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3972 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3973 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3975 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3976 DPSOFTRAST_Vector3Normalize(lightnormal);
3978 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3980 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3981 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3982 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3983 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3986 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3988 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3989 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3990 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3992 float f = 1.0f / 256.0f;
3993 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3994 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3995 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3998 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4000 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4001 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4002 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4003 DPSOFTRAST_Vector3Normalize(lightnormal);
4005 LightColor[0] = 1.0;
4006 LightColor[1] = 1.0;
4007 LightColor[2] = 1.0;
4011 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4012 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4013 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4014 DPSOFTRAST_Vector3Normalize(lightnormal);
// Output = glow (optional) + texel * (ambient + diffuse colour * N.L * light
// colour), clamped at 255; alpha = texel alpha * Color_Ambient[3].
4017 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4018 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4020 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4021 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4022 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4023 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4027 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4028 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4029 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4030 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4032 buffer_FragColorbgra8[x*4+0] = d[0];
4033 buffer_FragColorbgra8[x*4+1] = d[1];
4034 buffer_FragColorbgra8[x*4+2] = d[2];
4035 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only ------------------------------------------------------
// Presumably the final else of the permutation test (the else line itself is
// not visible in this excerpt): texel * Color_Ambient, plus glow when enabled.
4040 for (x = startx;x < endx;x++)
4043 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4044 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4045 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4046 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4048 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4050 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4051 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4052 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4053 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4057 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4058 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4059 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4060 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4062 buffer_FragColorbgra8[x*4+0] = d[0];
4063 buffer_FragColorbgra8[x*4+1] = d[1];
4064 buffer_FragColorbgra8[x*4+2] = d[2];
4065 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finish routine.
4068 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the LightSource shader. For every vertex it replaces
//   TEXCOORD1 with (LightPosition - position) expressed in the per-vertex
//             (svector, tvector, normal) basis originally held in TEXCOORD1/2/3
//   TEXCOORD2 with (EyePosition - position) expressed in that same basis
// then hands POSITION to DPSOFTRAST_Array_TransformProject and fills TEXCOORD3
// through DPSOFTRAST_Array_Transform with the ModelToLightM1 uniform.
// Takes no parameters and returns nothing; it works on the global dpsoftrast state.
4073 static void DPSOFTRAST_VertexShader_LightSource(void)
4076 int numvertices = dpsoftrast.numvertices;
4077 float LightPosition[4];
4078 float LightVector[4];
4079 float LightVectorModelSpace[4];
4080 float EyePosition[4];
4081 float EyeVectorModelSpace[4];
// Copy the two 4-component uniforms; only components 0..2 are used by the
// visible code below.
4087 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4088 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4089 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4090 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4091 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4092 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4093 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4094 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays read by the loop; TEXCOORD0 goes through TexMatrixM1 and
// TEXCOORD4 is loaded as well, although the loop below never touches it.
4095 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4096 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4097 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4098 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4099 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4100 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4101 for (i = 0;i < numvertices;i++)
// Read position and all three basis vectors into locals first: TEXCOORD1 and
// TEXCOORD2 are overwritten further down in this same iteration.
4103 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4104 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4105 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4106 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4107 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4108 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4109 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4110 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4111 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4112 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4113 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4114 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector: vertex-to-light offset projected onto the basis, written over
// TEXCOORD1 (w = 0). Not normalized here.
4115 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4116 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4117 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4118 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4119 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4120 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4121 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4122 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4123 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4124 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Eye vector: vertex-to-eye offset projected onto the basis, written over
// TEXCOORD2 (w = 0).
4125 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4126 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4127 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4128 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4129 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4130 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4131 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4132 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4133 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4134 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Both calls run after the loop, so the loop sees the untransformed positions
// and the original normals in TEXCOORD3.
// NOTE(review): presumably the argument order is (output array, input array,
// matrix), in which case TEXCOORD3 receives POSITION transformed by the
// ModelToLight matrix - confirm against DPSOFTRAST_Array_Transform, including
// whether it reads the original or the already projected positions.
4136 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4137 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4140 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4143 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4144 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4145 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4150 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4151 int x, startx = span->startx, endx = span->endx;
4152 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], /*Color_Glow[4],*/ Color_Pants[4], Color_Shirt[4], LightColor[4];
4153 float CubeVectordata[4];
4154 float CubeVectorslope[4];
4155 float LightVectordata[4];
4156 float LightVectorslope[4];
4157 float EyeVectordata[4];
4158 float EyeVectorslope[4];
4160 float diffusetex[4];
4162 float surfacenormal[4];
4163 float lightnormal[4];
4165 float specularnormal[4];
4168 float SpecularPower;
4169 float CubeVector[4];
4173 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4174 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4175 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4176 Color_Glow[3] = 0.0f;
4178 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4179 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4180 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4181 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4182 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4183 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4184 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4185 Color_Diffuse[3] = 0.0f;
4186 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4187 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4188 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4189 Color_Specular[3] = 0.0f;
4190 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4191 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4192 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4193 Color_Pants[3] = 0.0f;
4194 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4195 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4196 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4197 Color_Shirt[3] = 0.0f;
4198 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4199 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4200 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4201 LightColor[3] = 0.0f;
4202 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4203 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4204 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4205 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4206 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4207 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4208 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4209 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4211 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4212 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4215 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4216 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4218 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4219 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4220 for (x = startx;x < endx;x++)
4223 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4224 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4225 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4226 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4227 if (attenuation < 0.01f)
4229 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4231 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4232 if (attenuation < 0.01f)
4236 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4237 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4238 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4239 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4240 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4242 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4243 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4244 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4245 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4247 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4248 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4249 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4250 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4251 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4252 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4253 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4254 DPSOFTRAST_Vector3Normalize(surfacenormal);
4256 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4257 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4258 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4259 DPSOFTRAST_Vector3Normalize(lightnormal);
4261 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4263 if(thread->shader_exactspecularmath)
4265 // reflect lightnormal at surfacenormal, take the negative of that
4266 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4268 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4269 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4270 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4271 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4273 // dot of this and normalize(EyeVectorFogDepth.xyz)
4274 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4275 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4276 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4277 DPSOFTRAST_Vector3Normalize(eyenormal);
4279 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4283 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4284 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4285 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4286 DPSOFTRAST_Vector3Normalize(eyenormal);
4288 specularnormal[0] = lightnormal[0] + eyenormal[0];
4289 specularnormal[1] = lightnormal[1] + eyenormal[1];
4290 specularnormal[2] = lightnormal[2] + eyenormal[2];
4291 DPSOFTRAST_Vector3Normalize(specularnormal);
4293 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4295 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4297 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4299 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4300 attenuation *= (1.0f / 255.0f);
4301 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4302 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4303 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4304 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4308 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4309 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4310 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4311 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4313 buffer_FragColorbgra8[x*4+0] = d[0];
4314 buffer_FragColorbgra8[x*4+1] = d[1];
4315 buffer_FragColorbgra8[x*4+2] = d[2];
4316 buffer_FragColorbgra8[x*4+3] = d[3];
4319 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4321 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4322 for (x = startx;x < endx;x++)
4325 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4326 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4327 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4328 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4329 if (attenuation < 0.01f)
4331 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4333 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4334 if (attenuation < 0.01f)
4338 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4339 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4340 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4341 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4342 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4344 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4345 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4346 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4347 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4349 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4350 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4351 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4352 DPSOFTRAST_Vector3Normalize(surfacenormal);
4354 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4355 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4356 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4357 DPSOFTRAST_Vector3Normalize(lightnormal);
4359 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4360 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4362 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4363 attenuation *= (1.0f / 255.0f);
4364 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4365 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4366 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4367 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4371 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4372 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4373 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4374 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4376 buffer_FragColorbgra8[x*4+0] = d[0];
4377 buffer_FragColorbgra8[x*4+1] = d[1];
4378 buffer_FragColorbgra8[x*4+2] = d[2];
4379 buffer_FragColorbgra8[x*4+3] = d[3];
4384 for (x = startx;x < endx;x++)
4387 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4388 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4389 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4390 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4391 if (attenuation < 0.01f)
4393 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4395 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4396 if (attenuation < 0.01f)
4400 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4401 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4402 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4403 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4404 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4406 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4407 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4408 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4409 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4411 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4413 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4414 attenuation *= (1.0f / 255.0f);
4415 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4416 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4417 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4418 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4422 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4423 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4424 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4425 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4427 buffer_FragColorbgra8[x*4+0] = d[0];
4428 buffer_FragColorbgra8[x*4+1] = d[1];
4429 buffer_FragColorbgra8[x*4+2] = d[2];
4430 buffer_FragColorbgra8[x*4+3] = d[3];
4433 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4439 static void DPSOFTRAST_VertexShader_Refraction(void)
4441 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4442 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4443 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4446 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4448 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4450 int x, startx = span->startx, endx = span->endx;
4453 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4454 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4457 float ModelViewProjectionPositiondata[4];
4458 float ModelViewProjectionPositionslope[4];
4461 float ScreenScaleRefractReflect[2];
4462 float ScreenCenterRefractReflect[2];
4463 float DistortScaleRefractReflect[2];
4464 float RefractColor[4];
4466 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4467 if(!texture) return;
4470 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4471 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4474 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4477 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4478 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4479 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4480 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4481 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4482 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4483 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4484 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4485 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4486 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4489 for (x = startx;x < endx;x++)
4491 float SafeScreenTexCoord[2];
4492 float ScreenTexCoord[2];
4499 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4500 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4502 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4503 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4504 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4506 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4507 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4508 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4509 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4510 DPSOFTRAST_Vector3Normalize(v);
4511 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4512 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4514 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4515 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4517 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4518 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4519 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4520 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4523 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4528 static void DPSOFTRAST_VertexShader_Water(void)
4531 int numvertices = dpsoftrast.numvertices;
4532 float EyePosition[4];
4533 float EyeVectorModelSpace[4];
4539 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4540 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4541 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4542 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4543 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4544 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4545 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4546 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4547 for (i = 0;i < numvertices;i++)
4549 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4550 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4551 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4552 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4553 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4554 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4555 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4556 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4557 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4558 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4559 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4560 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4561 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4562 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4563 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4564 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4565 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4566 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4567 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4568 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4569 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4570 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4572 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4573 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4574 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4578 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4580 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4582 int x, startx = span->startx, endx = span->endx;
4585 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4586 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4589 float ModelViewProjectionPositiondata[4];
4590 float ModelViewProjectionPositionslope[4];
4591 float EyeVectordata[4];
4592 float EyeVectorslope[4];
4595 float ScreenScaleRefractReflect[4];
4596 float ScreenCenterRefractReflect[4];
4597 float DistortScaleRefractReflect[4];
4598 float RefractColor[4];
4599 float ReflectColor[4];
4600 float ReflectFactor;
4601 float ReflectOffset;
4603 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4604 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4605 if(!texture_refraction || !texture_reflection) return;
4608 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4609 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4612 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4613 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4616 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4617 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4618 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4619 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4620 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4621 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4622 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4623 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4624 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4625 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4626 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4627 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4628 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4629 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4630 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4631 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4632 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4633 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4634 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4635 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4636 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4637 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4640 for (x = startx;x < endx;x++)
4642 float SafeScreenTexCoord[4];
4643 float ScreenTexCoord[4];
4646 unsigned char c1[4];
4647 unsigned char c2[4];
4652 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4653 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4655 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4656 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4657 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4658 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4659 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4661 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4662 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4663 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4664 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4665 DPSOFTRAST_Vector3Normalize(v);
4666 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4667 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4668 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4669 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4671 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4672 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4673 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4674 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4675 DPSOFTRAST_Vector3Normalize(v);
4676 Fresnel = 1.0f - v[2];
4677 Fresnel = min(1.0f, Fresnel);
4678 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4680 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4681 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4682 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4683 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4685 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4686 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4687 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4688 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4691 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4696 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4698 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4701 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4704 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4705 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4706 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4707 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4708 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4713 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4715 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4718 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4721 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4722 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4723 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4724 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4725 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4730 typedef struct DPSOFTRAST_ShaderModeInfo_s
4733 void (*Vertex)(void);
4734 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4735 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4736 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4738 DPSOFTRAST_ShaderModeInfo;
4740 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4742 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4743 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4744 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4745 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4746 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4747 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4748 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4749 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4750 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4751 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4752 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4753 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4754 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4755 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4756 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4757 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4758 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel visibility mask for one span.
// For every x in the span the stored depth of the framebuffer row is compared
// with the span's interpolated depth (depthbase + depthslope*x) using the
// thread's depth function; the result goes into thread->pixelmaskarray, which
// is then published through span->pixelmask together with the start column.
// NOTE(review): this listing skips several short source lines of this function
// (local declarations, the load of endx, the bodies of the two while loops and
// the keyword introducing the memset branch); the comments below describe only
// the statements that are visible.
4761 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4766 unsigned int *depthpixel;
4770 unsigned char *pixelmask;
// depthpixel[x] addresses the depth buffer at row span->y, column span->x + x
4771 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4772 startx = span->startx;
4774 depth = span->depthbase;
4775 depthslope = span->depthslope;
4776 pixelmask = thread->pixelmaskarray;
4777 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4779 switch(thread->fb_depthfunc)
// each case fills pixelmask[startx..endx); note the stored depth is the left
// operand of the comparison and the interpolated depth d the right one
4782 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4783 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4784 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4785 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4786 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4787 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4788 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// NOTE(review): the loop bodies are not visible here; the conditions look for
// the first and last set mask entry, presumably to trim rejected pixels off
// both ends of the span - confirm against the full source
4790 while (startx < endx && !pixelmask[startx])
4792 while (endx > startx && !pixelmask[endx-1])
4797 // no depth testing means we're just dealing with color...
4798 memset(pixelmask + startx, 1, endx - startx);
// publish the mask and the start column for the pixel shader / depth write
4800 span->pixelmask = pixelmask;
4801 span->startx = startx;
// Depth write stage for one span; only active when the thread has both depth
// writes (depthmask) and depth testing enabled and a depth buffer is bound.
// Walks x over the span while stepping the interpolated depth d in lockstep.
// NOTE(review): the load of endx and the body of the for loop are not visible
// in this listing; presumably d is stored to depthpixel[x] where pixelmask[x]
// is set - confirm against the full source.
4805 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4807 int x, d, depth, depthslope, startx, endx;
4808 const unsigned char *pixelmask;
4809 unsigned int *depthpixel;
4810 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4812 depth = span->depthbase;
4813 depthslope = span->depthslope;
4814 pixelmask = span->pixelmask;
4815 startx = span->startx;
// depthpixel[x] addresses the depth buffer at row span->y, column span->x + x
4817 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4818 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Processes every span queued in thread->spans: depth test, pixel shader of
// the current shader mode, then depth write; finally empties the span queue.
// NOTE(review): the declaration of i and the statement controlled by the
// startx >= endx check are not visible in this listing (presumably the span is
// skipped when the depth test left nothing visible).
4824 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4827 DPSOFTRAST_State_Triangle *triangle;
4828 DPSOFTRAST_State_Span *span;
4829 for (i = 0; i < thread->numspans; i++)
4831 span = &thread->spans[i];
// each span records the index of the triangle setup it was generated from
4832 triangle = &thread->triangles[span->triangle];
// may shrink span->startx (and fills span->pixelmask)
4833 DPSOFTRAST_Draw_DepthTest(thread, span);
4834 if (span->startx >= span->endx)
4836 // run pixel shader if appropriate
4837 // do this before running depthmask code, to allow the pixelshader
4838 // to clear pixelmask values for alpha testing
4839 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4840 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4841 DPSOFTRAST_Draw_DepthWrite(thread, span);
// queue consumed
4843 thread->numspans = 0;
// Draw command record.
//   starty/endy   - row range touched by the draw (set in DPSOFTRAST_DrawTriangles)
//   refcount      - set to the thread count when queued; every interpreting
//                   thread decrements it once
//   clipped       - copy of dpsoftrast.drawclipped at queue time
//   arrays        - transformed vertex data: screen coordinates first, then the
//                   post-transform arrays (see DPSOFTRAST_Draw_AllocateDrawCommand)
//   element3i/3s  - copied triangle index list (32-bit or 16-bit), or both NULL
4846 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
// Rasterizes one Draw command for the rows owned by this thread.
// Per triangle: fetch indices, clip against the near plane, compute the
// clamped screen bounding box, set up plane equations for w and for every
// attribute array of the current shader mode, choose mip levels, then walk the
// rows and queue spans; queued spans are shaded by
// DPSOFTRAST_Draw_ProcessSpans whenever the span or triangle queue fills up
// and once more at the end.
// NOTE(review): this listing skips many short source lines of this function
// (braces, several declarations, if/else keywords, continue/break statements);
// the comments describe only what the visible statements establish.
4848 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4851 int cullface = thread->cullface;
4852 int minx, maxx, miny, maxy;
4853 int miny1, maxy1, miny2, maxy2;
4854 __m128i fbmin, fbmax;
4855 __m128 viewportcenter, viewportscale;
4856 int firstvertex = command->firstvertex;
4857 int numvertices = command->numvertices;
4858 int numtriangles = command->numtriangles;
4859 const int *element3i = command->element3i;
4860 const unsigned short *element3s = command->element3s;
4861 int clipped = command->clipped;
4868 int starty, endy, bandy;
4872 float clip0origin, clip0slope;
4874 __m128 triangleedge1, triangleedge2, trianglenormal;
4877 DPSOFTRAST_State_Triangle *triangle;
4878 DPSOFTRAST_Texture *texture;
4879 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp the thread's two row bands to the scissor rectangle's row range
4880 miny = thread->fb_scissor[1];
4881 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4882 miny1 = bound(miny, thread->miny1, maxy);
4883 maxy1 = bound(miny, thread->maxy1, maxy);
4884 miny2 = bound(miny, thread->miny2, maxy);
4885 maxy2 = bound(miny, thread->maxy2, maxy);
// the draw's row range misses both bands: drop this thread's reference
4886 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4888 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than its header had its vertex data allocated separately
4890 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4891 MM_FREE(command->arrays);
4895 minx = thread->fb_scissor[0];
4896 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp limits as packed 16-bit (x, y) pairs: top of band 1 to bottom of band 2
4897 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4898 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4899 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4900 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4901 screen[3] = _mm_setzero_ps();
4902 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4903 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen coordinates; the array that follows
// is the post-transform position array
4905 const float *screencoord4f = command->arrays;
4906 const float *arrays = screencoord4f + numvertices*4;
4908 // generate the 3 edges of this triangle
4909 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices rebased to firstvertex: 16-bit list first, 32-bit list below
// (the selecting conditions are not visible in this listing)
4912 e[0] = element3s[i*3+0] - firstvertex;
4913 e[1] = element3s[i*3+1] - firstvertex;
4914 e[2] = element3s[i*3+2] - firstvertex;
4918 e[0] = element3i[i*3+0] - firstvertex;
4919 e[1] = element3i[i*3+1] - firstvertex;
4920 e[2] = element3i[i*3+2] - firstvertex;
// SKIPBACKFACE: builds two screen-space edges from screen[1], derives the
// facing term in lane 0 of trianglenormal and tests its sign against zero
// (the statements run on each test are not visible in this listing)
4929 #define SKIPBACKFACE \
4930 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4931 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4932 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4933 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4934 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4938 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4942 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4947 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4948 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4950 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4951 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4953 #define CLIPPEDVERTEXCOPY(k,p1) \
4954 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIBCOPY / GENATTRIBLERP: fetch an attribute of the current array either
// straight from vertex p1 or interpolated from p1 towards p2 by clipfrac[p1]
// (the fraction stored by CLIPPEDVERTEXLERP)
4956 #define GENATTRIBCOPY(attrib, p1) \
4957 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4958 #define GENATTRIBLERP(attrib, p1, p2) \
4960 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4961 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4963 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4967 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4968 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4969 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4970 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4971 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4972 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4973 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4979 // calculate distance from nearplane
4980 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4981 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4982 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify by the sign of the three distances and build the clipped polygon:
// kept vertices are copied, crossing edges are interpolated (3 or 4 points)
4983 if (clipdist[0] >= 0.0f)
4985 if (clipdist[1] >= 0.0f)
4987 if (clipdist[2] >= 0.0f)
4990 // triangle is entirely in front of nearplane
4991 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4998 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5006 if (clipdist[2] >= 0.0f)
5008 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5015 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5022 else if (clipdist[1] >= 0.0f)
5024 if (clipdist[2] >= 0.0f)
5026 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5033 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5039 else if (clipdist[2] >= 0.0f)
5041 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5046 else continue; // triangle is entirely behind nearplane
5049 // calculate integer y coords for triangle points
5050 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5051 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5052 screenmin = _mm_min_epi16(screeni, screenir),
5053 screenmax = _mm_max_epi16(screeni, screenir);
5054 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5055 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the bounding box to the scissor / band limits
5056 screenmin = _mm_max_epi16(screenmin, fbmin);
5057 screenmax = _mm_min_epi16(screenmax, fbmax);
5058 // skip offscreen triangles
5059 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5061 starty = _mm_extract_epi16(screenmin, 1);
5062 endy = _mm_extract_epi16(screenmax, 1)+1;
// row range lies entirely in the gap between this thread's two bands
5063 if (starty >= maxy1 && endy <= miny2)
5065 screeny = _mm_srai_epi32(screeni, 16);
5068 triangle = &thread->triangles[thread->numtriangles];
5070 // calculate attribute plans for triangle data...
5071 // okay, this triangle is going to produce spans, we'd better project
5072 // the interpolants now (this is what gives perspective texturing),
5073 // this consists of simply multiplying all arrays by the W coord
5074 // (which is basically 1/Z), which will be undone per-pixel
5075 // (multiplying by Z again) to get the perspective-correct array
5078 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5079 __m128 mipedgescale, mipdensity;
// both edge vectors divided by the facing term; the shuffles below broadcast
// the four resulting components
5080 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5081 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5082 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5083 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5084 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5085 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5086 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5087 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5088 attribedge1 = _mm_sub_ss(w0, w1);
5089 attribedge2 = _mm_sub_ss(w2, w1);
5090 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5091 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5092 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5093 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5094 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// w plane, evaluated later as w[2] + x*w[0] + y*w[1]
5095 _mm_store_ss(&triangle->w[0], attribxslope);
5096 _mm_store_ss(&triangle->w[1], attribyslope);
5097 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: derive the screen-space plane of the screen z component and
// combine it with fb_clipplane into clip origin / x slope / y slope
5102 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5104 float cliporigin, clipxslope, clipyslope;
5105 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5106 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5107 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5108 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5109 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5110 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5111 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5112 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5113 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
// clip0origin + y*clip0slope gives the x of the clip boundary on row y;
// clip0dir records which side is kept (only its assignment for the
// x-dependent case is visible in this listing)
5116 clip0origin = -cliporigin/clipxslope;
5117 clip0slope = -clipyslope/clipxslope;
5118 clip0dir = clipxslope > 0 ? 1 : -1;
5120 else if(clipyslope > 0)
5122 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5123 clip0slope = dpsoftrast.fb_width;
5126 else if(clipyslope < 0)
5128 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5129 clip0slope = -dpsoftrast.fb_width;
5132 else if(clip0origin < 0) continue;
5135 mipedgescale = _mm_setzero_ps();
// plane equations (x slope, y slope, origin) for every attribute array listed
// for the current shader mode; attributes are multiplied by w first
5136 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5138 __m128 attrib0, attrib1, attrib2;
5139 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5140 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5142 arrays += numvertices*4;
5143 GENATTRIBS(attrib0, attrib1, attrib2);
5144 attriborigin = _mm_mul_ps(attrib1, w1);
5145 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5146 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5147 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5148 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5149 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5150 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5151 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5152 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the mode's lod array: attribute deltas along both edges scaled by the
// reciprocal screen-space edge lengths, used for mip selection below
5153 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5155 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5156 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5157 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5158 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// pick a mip level for every texture unit listed for the shader mode
5162 memset(triangle->mip, 0, sizeof(triangle->mip));
5163 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5165 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5166 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5168 texture = thread->texbound[texunit];
5169 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5171 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5172 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5173 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5174 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5175 // this will be multiplied in the texturing routine by the texture resolution
5176 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm of the squared density, capped at the last mip level
5179 y = (int)(log((float)y)*0.5f/M_LN2);
5180 if (y > texture->mipmaps - 1)
5181 y = texture->mipmaps - 1;
5182 triangle->mip[texunit] = y;
// walk the rows of the triangle: first up to the end of band 1, then resume
// at the start of band 2
5188 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5191 __m128 xcoords, xslope;
// one 32-bit lane per polygon point, all ones when y is greater than that
// point's integer y; movemask yields four mask bits per point
5192 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5193 int yccmask = _mm_movemask_epi8(ycc);
5194 int edge0p, edge0n, edge1p, edge1n;
// the mask selects the two active edges, each given as a pair of point
// indices (p, n); first table: 4-point polygon, second table: 3-point polygon
5203 case 0xFFFF: /*0000*/ y = endy; continue;
5204 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5205 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5206 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5207 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5208 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5209 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5210 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5211 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5212 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5213 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5214 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5215 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5216 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5217 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5218 case 0x0000: /*1111*/ y++; continue;
5226 case 0xFFFF: /*000*/ y = endy; continue;
5227 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5228 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5229 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5230 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5231 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5232 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5233 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can be walked with the same pair of edges, limited to
// the current band
5236 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5237 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5238 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5239 nexty = _mm_extract_epi16(ycc, 0);
5240 if (nexty >= bandy) nexty = bandy-1;
// per-row x step of both edges (delta divided by its own y component) and
// their x position on row y, biased by 0.5
5241 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5242 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5243 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5244 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5245 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in lane 0 (swap the two edges if needed)
5246 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5248 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5249 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5251 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5252 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5254 int startx, endx, offset;
// integer span bounds for this row, clamped to the scissor rectangle
5255 startx = _mm_cvtss_si32(xcoords);
5256 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5257 if (startx < minx) startx = minx;
5258 if (endx > maxx) endx = maxx;
5259 if (startx >= endx) continue;
// clip plane handling: drop or shorten the span at clip0 (the conditions that
// select between these statements are only partly visible in this listing)
5267 if(endx <= clip0) continue;
5268 startx = (int)clip0;
5271 else if (endx > clip0)
5273 if(startx >= clip0) continue;
// queue the row in chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5278 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5280 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5281 span->triangle = thread->numtriangles;
5285 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5286 if (span->startx >= span->endx)
// integer depth for the span derived from the interpolated w, with the
// polygon offset applied to the base value
5288 wslope = triangle->w[0];
5289 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5290 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5291 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: shade what is queued
5292 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5293 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: shade what is queued and restart the triangle queue
5298 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5300 DPSOFTRAST_Draw_ProcessSpans(thread);
5301 thread->numtriangles = 0;
// this thread is done with the command; whoever drops the last reference
// frees separately allocated vertex data
5305 if (!ATOMIC_DECREMENT(command->refcount))
5307 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5308 MM_FREE(command->arrays);
// shade whatever is still queued
5311 if (thread->numspans > 0 || thread->numtriangles > 0)
5313 DPSOFTRAST_Draw_ProcessSpans(thread);
5314 thread->numtriangles = 0;
// Reserves a Draw command plus storage for the transformed vertex data and a
// private copy of the index list, and points the global output arrays
// (dpsoftrast.screencoord4f / dpsoftrast.post_array4f[]) at that storage.
// Data layout: screen coordinates, post-transform positions, one float[4]
// array per attribute array listed for the current shader mode, then the
// triangle indices.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of index triples
//   element3i / element3s    - 32-bit / 16-bit index list to copy
// NOTE(review): this listing skips several short source lines of this function
// (declarations of i and j, the statements after the two ">= ARRAY_TOTAL"
// checks, the conditions selecting the 16-bit or 32-bit index path, the else
// of the allocation branch and the return statement).
5319 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5323 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// screen coordinates + post-transform positions
5324 int datasize = 2*numvertices*sizeof(float[4]);
5325 DPSOFTRAST_Command_Draw *command;
5326 unsigned char *data;
// one more float[4] array per attribute array used by the shader mode
5327 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5329 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5330 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5332 datasize += numvertices*sizeof(float[4]);
// room for the index copy (16-bit or 32-bit list)
5335 datasize += numtriangles*sizeof(unsigned short[3]);
5337 datasize += numtriangles*sizeof(int[3]);
5338 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to embed in the command: queue only the header and allocate the
// data separately; consumers recognise this case by commandsize and MM_FREE
// the arrays when done
// NOTE(review): the MM_CALLOC result is used without a NULL check
5339 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5341 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5342 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data directly follows the command header
5346 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5347 data = (unsigned char *)command + commandsize;
5349 command->firstvertex = firstvertex;
5350 command->numvertices = numvertices;
5351 command->numtriangles = numtriangles;
5352 command->arrays = (float *)data;
// reset all output array pointers, then hand out consecutive slices of data
5353 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5354 dpsoftrast.firstvertex = firstvertex;
5355 dpsoftrast.numvertices = numvertices;
5356 dpsoftrast.screencoord4f = (float *)data;
5357 data += numvertices*sizeof(float[4]);
5358 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5359 data += numvertices*sizeof(float[4]);
5360 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5362 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5363 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5365 dpsoftrast.post_array4f[j] = (float *)data;
5366 data += numvertices*sizeof(float[4]);
// copy the index list so the command does not reference caller memory
5368 command->element3i = NULL;
5369 command->element3s = NULL;
5372 command->element3s = (unsigned short *)data;
5373 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5377 command->element3i = (int *)data;
5378 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the vertex stage of
// the current shader mode (which fills the arrays set up by the allocator),
// records the draw's row range and hands the command to the worker threads.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of index triples
//   element3i / element3s    - 32-bit / 16-bit index list
// NOTE(review): this listing skips several short source lines of this function
// (braces, the declaration of i, and presumably a return after the command is
// undone and the else in front of the final flush) - confirm against the full
// source.
5383 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5385 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5386 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// row range of the draw, clamped to the framebuffer height
5387 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5388 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated vertex data and retract the command
5389 if (command->starty >= command->endy)
5391 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5392 MM_FREE(command->arrays);
5393 DPSOFTRAST_UndoCommand(command->commandsize);
5396 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; DPSOFTRAST_Interpret_Draw drops one per thread
5397 command->refcount = dpsoftrast.numthreads;
5399 if (dpsoftrast.usethreads)
5402 DPSOFTRAST_Draw_SyncCommands();
// wake starving threads whose row bands overlap the draw
5403 for (i = 0; i < dpsoftrast.numthreads; i++)
5405 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5406 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5407 Thread_CondSignal(thread->drawcond);
5412 DPSOFTRAST_Draw_FlushThreads();
5416 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5417 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5419 thread->validate |= DPSOFTRAST_VALIDATE_FB;
5421 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5423 DPSOFTRAST_Command_SetRenderTargets *command;
5424 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5425 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5426 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5428 dpsoftrast.fb_width = width;
5429 dpsoftrast.fb_height = height;
5430 dpsoftrast.fb_depthpixels = depthpixels;
5431 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5432 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5433 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5434 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5435 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5436 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5437 command->width = width;
5438 command->height = height;
5441 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5443 int commandoffset = thread->commandoffset;
5444 while (commandoffset != endoffset)
5446 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5447 switch (command->opcode)
5449 #define INTERPCOMMAND(name) \
5450 case DPSOFTRAST_OPCODE_##name : \
5451 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5452 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5453 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5454 commandoffset = 0; \
5456 INTERPCOMMAND(Viewport)
5457 INTERPCOMMAND(ClearColor)
5458 INTERPCOMMAND(ClearDepth)
5459 INTERPCOMMAND(ColorMask)
5460 INTERPCOMMAND(DepthTest)
5461 INTERPCOMMAND(ScissorTest)
5462 INTERPCOMMAND(Scissor)
5463 INTERPCOMMAND(BlendFunc)
5464 INTERPCOMMAND(BlendSubtract)
5465 INTERPCOMMAND(DepthMask)
5466 INTERPCOMMAND(DepthFunc)
5467 INTERPCOMMAND(DepthRange)
5468 INTERPCOMMAND(PolygonOffset)
5469 INTERPCOMMAND(CullFace)
5470 INTERPCOMMAND(SetTexture)
5471 INTERPCOMMAND(SetShader)
5472 INTERPCOMMAND(Uniform4f)
5473 INTERPCOMMAND(UniformMatrix4f)
5474 INTERPCOMMAND(Uniform1i)
5475 INTERPCOMMAND(SetRenderTargets)
5476 INTERPCOMMAND(ClipPlane)
5478 case DPSOFTRAST_OPCODE_Draw:
5479 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5480 commandoffset += command->commandsize;
5481 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5483 thread->commandoffset = commandoffset;
5486 case DPSOFTRAST_OPCODE_Reset:
5491 thread->commandoffset = commandoffset;
5494 static int DPSOFTRAST_Draw_Thread(void *data)
5496 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5497 while(thread->index >= 0)
5499 if (thread->commandoffset != dpsoftrast.drawcommand)
5501 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5505 Thread_LockMutex(thread->drawmutex);
5506 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5508 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5509 thread->starving = true;
5510 Thread_CondWait(thread->drawcond, thread->drawmutex);
5511 thread->starving = false;
5513 Thread_UnlockMutex(thread->drawmutex);
5519 static void DPSOFTRAST_Draw_FlushThreads(void)
5521 DPSOFTRAST_State_Thread *thread;
5523 DPSOFTRAST_Draw_SyncCommands();
5524 if (dpsoftrast.usethreads)
5526 for (i = 0; i < dpsoftrast.numthreads; i++)
5528 thread = &dpsoftrast.threads[i];
5529 if (thread->commandoffset != dpsoftrast.drawcommand)
5531 Thread_LockMutex(thread->drawmutex);
5532 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5533 Thread_CondSignal(thread->drawcond);
5534 Thread_UnlockMutex(thread->drawmutex);
5537 for (i = 0; i < dpsoftrast.numthreads; i++)
5539 thread = &dpsoftrast.threads[i];
5540 if (thread->commandoffset != dpsoftrast.drawcommand)
5542 Thread_LockMutex(thread->drawmutex);
5543 if (thread->commandoffset != dpsoftrast.drawcommand)
5545 thread->waiting = true;
5546 Thread_CondWait(thread->waitcond, thread->drawmutex);
5547 thread->waiting = false;
5549 Thread_UnlockMutex(thread->drawmutex);
5555 for (i = 0; i < dpsoftrast.numthreads; i++)
5557 thread = &dpsoftrast.threads[i];
5558 if (thread->commandoffset != dpsoftrast.drawcommand)
5559 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5562 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
// Public finish entry point.
// NOTE(review): the body was lost in this copy of the file; it is
// reconstructed as a plain flush - confirm against the pristine file.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5575 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5585 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5586 dpsoftrast.bigendian = u.b[3];
5587 dpsoftrast.fb_width = width;
5588 dpsoftrast.fb_height = height;
5589 dpsoftrast.fb_depthpixels = depthpixels;
5590 dpsoftrast.fb_colorpixels[0] = colorpixels;
5591 dpsoftrast.fb_colorpixels[1] = NULL;
5592 dpsoftrast.fb_colorpixels[1] = NULL;
5593 dpsoftrast.fb_colorpixels[1] = NULL;
5594 dpsoftrast.viewport[0] = 0;
5595 dpsoftrast.viewport[1] = 0;
5596 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5597 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5598 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5599 dpsoftrast.texture_firstfree = 1;
5600 dpsoftrast.texture_end = 1;
5601 dpsoftrast.texture_max = 0;
5602 dpsoftrast.color[0] = 1;
5603 dpsoftrast.color[1] = 1;
5604 dpsoftrast.color[2] = 1;
5605 dpsoftrast.color[3] = 1;
5606 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5607 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5608 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5609 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5610 for (i = 0; i < dpsoftrast.numthreads; i++)
5612 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5614 thread->cullface = GL_BACK;
5615 thread->colormask[0] = 1;
5616 thread->colormask[1] = 1;
5617 thread->colormask[2] = 1;
5618 thread->colormask[3] = 1;
5619 thread->blendfunc[0] = GL_ONE;
5620 thread->blendfunc[1] = GL_ZERO;
5621 thread->depthmask = true;
5622 thread->depthtest = true;
5623 thread->depthfunc = GL_LEQUAL;
5624 thread->scissortest = false;
5625 thread->viewport[0] = 0;
5626 thread->viewport[1] = 0;
5627 thread->viewport[2] = dpsoftrast.fb_width;
5628 thread->viewport[3] = dpsoftrast.fb_height;
5629 thread->scissor[0] = 0;
5630 thread->scissor[1] = 0;
5631 thread->scissor[2] = dpsoftrast.fb_width;
5632 thread->scissor[3] = dpsoftrast.fb_height;
5633 thread->depthrange[0] = 0;
5634 thread->depthrange[1] = 1;
5635 thread->polygonoffset[0] = 0;
5636 thread->polygonoffset[1] = 0;
5637 thread->clipplane[0] = 0;
5638 thread->clipplane[1] = 0;
5639 thread->clipplane[2] = 0;
5640 thread->clipplane[3] = 1;
5642 thread->numspans = 0;
5643 thread->numtriangles = 0;
5644 thread->commandoffset = 0;
5645 thread->waiting = false;
5646 thread->starving = false;
5648 thread->validate = -1;
5649 DPSOFTRAST_Validate(thread, -1);
5651 if (dpsoftrast.usethreads)
5653 thread->waitcond = Thread_CreateCond();
5654 thread->drawcond = Thread_CreateCond();
5655 thread->drawmutex = Thread_CreateMutex();
5656 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5662 void DPSOFTRAST_Shutdown(void)
5665 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5667 DPSOFTRAST_State_Thread *thread;
5668 for (i = 0; i < dpsoftrast.numthreads; i++)
5670 thread = &dpsoftrast.threads[i];
5671 Thread_LockMutex(thread->drawmutex);
5673 Thread_CondSignal(thread->drawcond);
5674 Thread_UnlockMutex(thread->drawmutex);
5675 Thread_WaitThread(thread->thread, 0);
5676 Thread_DestroyCond(thread->waitcond);
5677 Thread_DestroyCond(thread->drawcond);
5678 Thread_DestroyMutex(thread->drawmutex);
5681 for (i = 0;i < dpsoftrast.texture_end;i++)
5682 if (dpsoftrast.texture[i].bytes)
5683 MM_FREE(dpsoftrast.texture[i].bytes);
5684 if (dpsoftrast.texture)
5685 free(dpsoftrast.texture);
5686 if (dpsoftrast.threads)
5687 MM_FREE(dpsoftrast.threads);
5688 memset(&dpsoftrast, 0, sizeof(dpsoftrast));