3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// Compatibility shim for GCC releases older than 4.6 (the version named by the
// guard): emulate _mm_cvtss_f32 with the vector-extract builtin. The guard
// must spell __GNUC__ in full and compare major/minor as a pair, otherwise it
// also fires on current compilers. clang is excluded explicitly.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
103 #define MM_FREE _mm_free
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Vertex attribute slots. The values index the per-triangle attribs[] table
// and the post_array4f[] pointers, both of which are sized by
// DPSOFTRAST_ARRAY_TOTAL.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR,
	// eight consecutive texture coordinate slots
	DPSOFTRAST_ARRAY_TEXCOORD0,
	DPSOFTRAST_ARRAY_TEXCOORD1,
	DPSOFTRAST_ARRAY_TEXCOORD2,
	DPSOFTRAST_ARRAY_TEXCOORD3,
	DPSOFTRAST_ARRAY_TEXCOORD4,
	DPSOFTRAST_ARRAY_TEXCOORD5,
	DPSOFTRAST_ARRAY_TEXCOORD6,
	DPSOFTRAST_ARRAY_TEXCOORD7,
	// number of slots above; not a slot itself
	DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
126 typedef struct DPSOFTRAST_Texture_s
133 DPSOFTRAST_TEXTURE_FILTER filter;
136 ATOMIC_COUNTER binds;
137 unsigned char *bytes;
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
170 ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// SSE attribute setup for one span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + (span->x * slope + span->y * attribs[arrayindex][1])
// slope and data must be __m128 lvalues; attribs rows are read with aligned loads.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps( \
		_mm_load_ps((triangle)->attribs[arrayindex][2]), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
			_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: fills slope[0..3] from
// attribs[arrayindex][0] and data[0..3] with
//   attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// Every macro argument is parenthesized so pointer expressions such as
// "spans + i" expand correctly; note that the arguments are evaluated several times.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc maps
// GL blend factor pairs onto these.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0, // also the fallback for unrecognized factor pairs
	DPSOFTRAST_BLENDMODE_ALPHA,
	DPSOFTRAST_BLENDMODE_ADDALPHA,
	DPSOFTRAST_BLENDMODE_ADD,
	DPSOFTRAST_BLENDMODE_INVMOD,
	DPSOFTRAST_BLENDMODE_MUL,
	DPSOFTRAST_BLENDMODE_MUL2,
	DPSOFTRAST_BLENDMODE_SUBALPHA,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
	DPSOFTRAST_BLENDMODE_INVADD,
	// number of modes above; not a mode itself
	DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
255 float polygonoffset[2];
257 ALIGN(float fb_clipplane[4]);
260 int shader_permutation;
261 int shader_exactspecularmath;
263 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
265 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
268 // DPSOFTRAST_VALIDATE_ flags
271 // derived values (DPSOFTRAST_VALIDATE_FB)
274 ALIGN(float fb_viewportcenter[4]);
275 ALIGN(float fb_viewportscale[4]);
277 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
280 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
289 ATOMIC(volatile int commandoffset);
291 volatile bool waiting;
292 volatile bool starving;
299 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
303 DPSOFTRAST_State_Thread);
305 typedef ALIGN(struct DPSOFTRAST_State_s
309 unsigned int *fb_depthpixels;
310 unsigned int *fb_colorpixels[4];
313 ALIGN(float fb_viewportcenter[4]);
314 ALIGN(float fb_viewportscale[4]);
317 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
320 const float *pointer_vertex3f;
321 const float *pointer_color4f;
322 const unsigned char *pointer_color4ub;
323 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
326 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
328 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
332 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333 float *screencoord4f;
339 int shader_permutation;
340 int shader_exactspecularmath;
344 int texture_firstfree;
345 DPSOFTRAST_Texture *texture;
350 const char *errorstring;
355 DPSOFTRAST_State_Thread *threads;
357 ATOMIC(volatile int drawcommand);
359 DPSOFTRAST_State_Command_Pool commandpool;
363 DPSOFTRAST_State dpsoftrast;
// depth buffer scale: 1024*1048576 = 2^30 integer units per 1.0 of depth
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32bit word laid out as A<<24 | R<<16 | G<<8 | B,
// rounding each channel to the nearest of 0..255. Arguments are parenthesized so
// expressions can be passed; the shifts are done on unsigned values because
// 255<<24 does not fit in a signed int. The result type stays int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth buffer value DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
375 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377 fb_viewportcenter[3] = 0.5f;
378 fb_viewportcenter[0] = 0.0f;
379 fb_viewportscale[1] = 0.5f * viewport[2];
380 fb_viewportscale[2] = -0.5f * viewport[3];
381 fb_viewportscale[3] = 0.5f;
382 fb_viewportscale[0] = 1.0f;
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
387 if (dpsoftrast.interlace)
389 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
396 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
403 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes the thread's framebuffer-space scissor rectangle and then the
// values derived from it: viewport transform, clip plane and row ranges.
// fb_scissor is stored as {x, y, width, height}.
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
412 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413 // and viewport projection values
416 x1 = thread->scissor[0];
417 x2 = thread->scissor[0] + thread->scissor[2];
// the scissor's y is flipped against the framebuffer height
418 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off the whole framebuffer is used
420 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
422 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
424 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425 thread->fb_scissor[0] = x1;
426 thread->fb_scissor[1] = y1;
427 thread->fb_scissor[2] = x2 - x1;
428 thread->fb_scissor[3] = y2 - y1;
// RecalcClipPlane reads the viewport center/scale, so RecalcViewport has to run first
430 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431 DPSOFTRAST_RecalcClipPlane(thread);
432 DPSOFTRAST_RecalcThread(thread);
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
437 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
442 if (thread->blendsubtract)
444 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
446 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
454 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
456 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is still pending on the thread; always evaluates to 0.
// thread is parenthesized so pointer expressions work; it is evaluated twice.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
475 mask &= thread->validate;
478 if (mask & DPSOFTRAST_VALIDATE_FB)
480 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481 DPSOFTRAST_RecalcFB(thread);
483 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
485 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486 DPSOFTRAST_RecalcDepthFunc(thread);
488 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
490 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491 DPSOFTRAST_RecalcBlendFunc(thread);
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
497 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498 return &dpsoftrast.texture[index];
// Enlarges the texture table (texture_max is raised to at least 1024, with a
// doubling step) and re-targets every texbound pointer, both the global ones
// and each thread's, so they keep referring to the same slot of the
// reallocated array.
502 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be rebased by slot offset
504 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505 DPSOFTRAST_State_Thread *thread;
509 // expand texture array as needed
510 if (dpsoftrast.texture_max < 1024)
511 dpsoftrast.texture_max = 1024;
513 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result replaces the only pointer to the table and is
// not checked for NULL in the lines visible here; confirm the out-of-memory policy
514 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// (old pointer - old base) is the slot index; add it to the new base
515 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516 if (dpsoftrast.texbound[i])
517 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// same fixup for the per-thread copies of the bindings
518 for (j = 0; j < dpsoftrast.numthreads; j++)
520 thread = &dpsoftrast.threads[j];
521 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522 if (thread->texbound[i])
523 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture of the given dimensions. flags carries the pixel format
// (DPSOFTRAST_TEXTURE_FORMAT_*) together with DPSOFTRAST_TEXTURE_FLAG_* bits.
// Invalid requests are reported through dpsoftrast.errorstring. On success a
// slot of the texture table is initialized, the per-level layout table
// (mipmap[]) is filled in and zeroed pixel storage is allocated.
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces, everything else one
536 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538 DPSOFTRAST_Texture *texture;
// NOTE(review): this tests the product, so it relies on the later power-of-two
// check to reject negative dimensions whose product is positive
539 if (width*height*depth < 1)
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
546 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
551 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
568 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are rejected for every format
573 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check directly above; this
// repetition looks redundant
583 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
590 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// v & (v-1) is zero only when v has at most one bit set
593 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
595 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598 // find first empty slot in texture array
599 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts just after the slot chosen here
602 dpsoftrast.texture_firstfree = texnum + 1;
603 if (dpsoftrast.texture_max <= texnum)
604 DPSOFTRAST_Texture_Grow();
605 if (dpsoftrast.texture_end <= texnum)
606 dpsoftrast.texture_end = texnum + 1;
607 texture = &dpsoftrast.texture[texnum];
608 memset(texture, 0, sizeof(*texture));
609 texture->flags = flags;
610 texture->width = width;
611 texture->height = height;
612 texture->depth = depth;
613 texture->sides = sides;
// per-level layout record: [0] byte offset, [1] byte size, [2] width, [3] height, [4] depth
// (4 bytes per pixel, all sides included in the size)
625 s = w * h * d * sides * 4;
626 texture->mipmap[mipmaps][0] = size;
627 texture->mipmap[mipmaps][1] = s;
628 texture->mipmap[mipmaps][2] = w;
629 texture->mipmap[mipmaps][3] = h;
630 texture->mipmap[mipmaps][4] = d;
// the level chain ends at 1x1x1, or immediately when mipmaps were not requested
633 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
639 texture->mipmaps = mipmaps;
640 texture->size = size;
642 // allocate the pixels now
// NOTE(review): MM_CALLOC can return NULL; no check is visible here, and a slot
// with bytes == NULL is what the rest of this file treats as free
643 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the texture behind index (no-op for an invalid handle): frees its
// pixel storage, clears the slot and updates the free/used bookkeeping of the
// texture table.
647 void DPSOFTRAST_Texture_Free(int index)
649 DPSOFTRAST_Texture *texture;
650 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654 MM_FREE(texture->bytes);
655 texture->bytes = NULL;
// zeroing the slot also leaves bytes == NULL, which marks the slot as free
656 memset(texture, 0, sizeof(*texture));
657 // adjust the free range and used range
658 if (dpsoftrast.texture_firstfree > index)
659 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing free slots
660 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661 dpsoftrast.texture_end--;
// Regenerates mipmap levels 1..mipmaps-1 of a texture by averaging texels of
// the next larger level (4 bytes per texel, each byte averaged separately with
// rounding). Does nothing for an invalid handle.
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
665 int i, x, y, z, w, layer0, layer1, row0, row1;
// o: output texel; i0..i3: the (layer0|layer1, row0|row1) source rows of the parent level
666 unsigned char *o, *i0, *i1, *i2, *i3;
667 DPSOFTRAST_Texture *texture;
668 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669 if (texture->mipmaps <= 1)
671 for (i = 1;i < texture->mipmaps;i++)
// mipmap[level][2], [3], [4] are the width, height and depth of that level
673 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
677 if (layer1 >= texture->mipmap[i-1][4])
678 layer1 = texture->mipmap[i-1][4]-1;
679 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
683 if (row1 >= texture->mipmap[i-1][3])
684 row1 = texture->mipmap[i-1][3]-1;
// row start = level offset + 4 bytes * ((height * layer + row) * width)
685 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
686 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690 w = texture->mipmap[i][2];
693 if (texture->mipmap[i-1][2] > 1)
695 // average 3D texture
// eight source texels per output texel: two neighbours in each of the four rows
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
698 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
706 // average 3D mipmap with parent width == 1
// NOTE(review): only i0/i1 advance in this loop; presumably harmless because w is 1 when the parent width is 1 - confirm
707 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
709 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
718 if (texture->mipmap[i-1][2] > 1)
720 // average 2D texture (common case)
721 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
723 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
731 // 2D texture with parent width == 1
732 o[0] = (i0[0] + i1[0] + 1) >> 1;
733 o[1] = (i0[1] + i1[1] + 1) >> 1;
734 o[2] = (i0[2] + i1[2] + 1) >> 1;
735 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight block of 4-byte pixels into the texture
// at (blockx, blocky) and then regenerates the mipmaps. Destination rows are
// written from the end of level 0 backwards, so the rows land in the opposite
// vertical order to the source. Does nothing for an invalid handle.
// NOTE(review): the lines visible here always address level 0 (mipmap[0]),
// whatever mip is passed, and the block is not range-checked against the
// texture size - confirm against the callers.
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
744 DPSOFTRAST_Texture *texture;
746 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// mipmap[0][1] is the byte size of level 0, mipmap[0][2] its width in pixels
751 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
752 while (blockheight > 0)
// step one destination row back before each copy
754 dst -= texture->mipmap[0][2] * 4;
755 memcpy(dst, pixels, blockwidth * 4);
756 pixels += blockwidth * 4;
760 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the contents of level 0 with pixels (4 bytes per pixel, one
// memcpy per row) and then regenerates the mipmaps. Does nothing for an
// invalid handle.
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
764 DPSOFTRAST_Texture *texture;
765 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// stride: bytes per row of level 0
770 int i, stride = texture->mipmap[0][2]*4;
// dst starts at the end of level 0 (mipmap[0][1] is the byte size of level 0)
771 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
// one iteration per row of level 0
772 for (i = texture->mipmap[0][3];i > 0;i--)
775 memcpy(dst, pixels, stride);
779 DPSOFTRAST_Texture_CalculateMipmaps(index);
781 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
783 DPSOFTRAST_Texture *texture;
784 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785 return texture->mipmap[mip][2];
787 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
789 DPSOFTRAST_Texture *texture;
790 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791 return texture->mipmap[mip][3];
793 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
795 DPSOFTRAST_Texture *texture;
796 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
797 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mipmap level mip inside the
// texture's pixel storage, or 0 for an invalid handle.
// NOTE(review): mip is not range-checked in the lines visible here.
799 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
801 DPSOFTRAST_Texture *texture;
802 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of that level
805 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported through errorstring when the
// texture was created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
// Does nothing for an invalid handle.
807 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
809 DPSOFTRAST_Texture *texture;
810 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
811 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
813 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
818 texture->filter = filter;
821 static void DPSOFTRAST_Draw_FlushThreads(void);
823 static void DPSOFTRAST_Draw_SyncCommands(void)
825 if(dpsoftrast.usethreads) MEMORY_BARRIER;
826 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes sure at least space bytes of the command pool are available. The
// amount in use is recomputed from how far each thread's read position
// (commandoffset) lags behind the write position, and a lagging thread is
// waited on when that is still too much.
829 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
831 DPSOFTRAST_State_Thread *thread;
833 int freecommand = dpsoftrast.commandpool.freecommand;
834 int usedcommands = dpsoftrast.commandpool.usedcommands;
// fast path: the cached usage figure already leaves enough room
835 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
837 DPSOFTRAST_Draw_SyncCommands();
843 for (i = 0; i < dpsoftrast.numthreads; i++)
845 thread = &dpsoftrast.threads[i];
// distance from this thread's read position to the write position, wrapped to the pool size
846 commandoffset = freecommand - thread->commandoffset;
847 if (commandoffset < 0)
848 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// the thread that lags furthest determines how much of the pool is in use
849 if (commandoffset > usedcommands)
852 usedcommands = commandoffset;
855 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
857 thread = &dpsoftrast.threads[waitindex];
858 Thread_LockMutex(thread->drawmutex);
// only wait while the thread has not caught up with the published command position
859 if (thread->commandoffset != dpsoftrast.drawcommand)
861 thread->waiting = true;
// signal drawcond when the thread has flagged itself as starving
862 if (thread->starving) Thread_CondSignal(thread->drawcond);
863 Thread_CondWait(thread->waitcond, thread->drawmutex);
864 thread->waiting = false;
866 Thread_UnlockMutex(thread->drawmutex);
868 dpsoftrast.commandpool.usedcommands = usedcommands;
// DPSOFTRAST_ALIGNCOMMAND rounds size up to the next multiple of COMMAND_SIZE
// (the masking assumes COMMAND_SIZE is a power of two).
871 #define DPSOFTRAST_ALIGNCOMMAND(size) \
872 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// DPSOFTRAST_ALLOCATECOMMAND(name) reserves an aligned DPSOFTRAST_Command_<name>
// in the command pool, tagged with DPSOFTRAST_OPCODE_<name>, and yields a typed
// pointer to it.
873 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
874 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves size bytes for a command in the command pool and fills in its
// opcode and commandsize. When the command does not fit into the tail of the
// pool, a DPSOFTRAST_OPCODE_Reset command is written at the current position
// and the skipped tail is counted as used. If the pool is too full, room is
// made first.
876 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
878 DPSOFTRAST_Command *command;
879 int freecommand = dpsoftrast.commandpool.freecommand;
880 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra: room for a command header, plus the unusable tail when this command would not fit before the end
881 int extra = sizeof(DPSOFTRAST_Command);
882 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
883 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
884 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
886 if (dpsoftrast.usethreads)
887 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
889 DPSOFTRAST_Draw_FlushThreads();
// reload the counters after making room
890 freecommand = dpsoftrast.commandpool.freecommand;
891 usedcommands = dpsoftrast.commandpool.usedcommands;
// not enough room before the end of the pool: write a Reset marker here
893 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
895 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
896 command->opcode = DPSOFTRAST_OPCODE_Reset;
897 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
900 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
901 command->opcode = opcode;
902 command->commandsize = size;
904 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
906 dpsoftrast.commandpool.freecommand = freecommand;
907 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back a command allocation of size bytes: the pool's write position
// and usage counter are adjusted and stored back.
911 static void DPSOFTRAST_UndoCommand(int size)
913 int freecommand = dpsoftrast.commandpool.freecommand;
914 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap the write position by the pool size
917 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
918 usedcommands -= size;
919 dpsoftrast.commandpool.freecommand = freecommand;
920 dpsoftrast.commandpool.usedcommands = usedcommands;
923 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
924 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
926 thread->viewport[0] = command->x;
927 thread->viewport[1] = command->y;
928 thread->viewport[2] = command->width;
929 thread->viewport[3] = command->height;
930 thread->validate |= DPSOFTRAST_VALIDATE_FB;
932 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
934 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
937 command->width = width;
938 command->height = height;
940 dpsoftrast.viewport[0] = x;
941 dpsoftrast.viewport[1] = y;
942 dpsoftrast.viewport[2] = width;
943 dpsoftrast.viewport[3] = height;
944 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
947 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Executes a queued ClearColor command: fills the part of the scissor
// rectangle that lies in this thread's row ranges with the packed clear
// colour, in each colour buffer.
948 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
950 int i, x1, y1, x2, y2, w, h, x, y;
951 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row ranges read below are derived values; refresh them first
955 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
956 miny1 = thread->miny1;
957 maxy1 = thread->maxy1;
958 miny2 = thread->miny2;
959 maxy2 = thread->maxy2;
// fb_scissor is {x, y, width, height}; convert it to an x1..x2 / y1..y2 rectangle
960 x1 = thread->fb_scissor[0];
961 y1 = thread->fb_scissor[1];
962 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
963 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rectangle to the rows between the thread's first and last row
964 if (y1 < miny1) y1 = miny1;
965 if (y2 > maxy2) y2 = maxy2;
970 // FIXME: honor fb_colormask?
971 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// up to four colour buffers
972 for (i = 0;i < 4;i++)
974 if (!dpsoftrast.fb_colorpixels[i])
// two passes: rows up to maxy1, then a jump to miny2 for rows up to maxy2
976 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
979 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
980 for (x = x1;x < x2;x++)
985 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
987 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
994 DEFCOMMAND(3, ClearDepth, float depth;)
// Executes a queued ClearDepth command: fills the part of the scissor
// rectangle that lies in this thread's row ranges of the depth buffer with
// the integer form of command->depth.
995 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
997 int x1, y1, x2, y2, w, h, x, y;
998 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row ranges read below are derived values; refresh them first
1002 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1003 miny1 = thread->miny1;
1004 maxy1 = thread->maxy1;
1005 miny2 = thread->miny2;
1006 maxy2 = thread->maxy2;
// fb_scissor is {x, y, width, height}; convert it to an x1..x2 / y1..y2 rectangle
1007 x1 = thread->fb_scissor[0];
1008 y1 = thread->fb_scissor[1];
1009 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1010 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rectangle to the rows between the thread's first and last row
1011 if (y1 < miny1) y1 = miny1;
1012 if (y2 > maxy2) y2 = maxy2;
1017 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two passes: rows up to maxy1, then a jump to miny2 for rows up to maxy2
1018 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1019 for (;y < bandy;y++)
1021 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1022 for (x = x1;x < x2;x++)
1026 void DPSOFTRAST_ClearDepth(float d)
1028 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1032 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1033 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1035 thread->colormask[0] = command->r != 0;
1036 thread->colormask[1] = command->g != 0;
1037 thread->colormask[2] = command->b != 0;
1038 thread->colormask[3] = command->a != 0;
1039 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1041 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1043 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1050 DEFCOMMAND(5, DepthTest, int enable;)
1051 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1053 thread->depthtest = command->enable;
1054 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1056 void DPSOFTRAST_DepthTest(int enable)
1058 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1059 command->enable = enable;
// ScissorTest command record: a single enable flag.
1062 DEFCOMMAND(6, ScissorTest, int enable;)
// Per-thread interpreter: stores the flag and marks the framebuffer state for revalidation.
1063 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1065 thread->scissortest = command->enable;
1066 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: enqueue a ScissorTest command carrying `enable`.
1068 void DPSOFTRAST_ScissorTest(int enable)
1070 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1071 command->enable = enable;
// Scissor command record: rectangle origin (x, y) and size (width, height) as floats.
1074 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Per-thread interpreter: copies the rectangle into thread->scissor[0..3] in the order
// x, y, width, height and marks the framebuffer state for revalidation.
1075 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1077 thread->scissor[0] = command->x;
1078 thread->scissor[1] = command->y;
1079 thread->scissor[2] = command->width;
1080 thread->scissor[3] = command->height;
1081 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: enqueue a Scissor command with the given rectangle.
// NOTE(review): only the width/height assignments are visible in this listing; the x/y
// assignments are presumably on the elided lines -- confirm.
1083 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1085 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1088 command->width = width;
1089 command->height = height;
// BlendFunc command record: source and destination blend factors.
1092 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Per-thread interpreter: stores the factors (source in [0], destination in [1]) and marks
// the blend state for revalidation.
1093 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1095 thread->blendfunc[0] = command->sfactor;
1096 thread->blendfunc[1] = command->dfactor;
1097 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public API: enqueue a BlendFunc command carrying the source and destination factors.
1099 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1101 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1102 command->sfactor = sfactor;
1103 command->dfactor = dfactor;
// BlendSubtract command record: a single enable flag.
1106 DEFCOMMAND(9, BlendSubtract, int enable;)
// Per-thread interpreter: stores the flag and marks the blend state for revalidation.
1107 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1109 thread->blendsubtract = command->enable;
1110 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public API: enqueue a BlendSubtract command carrying `enable`.
1112 void DPSOFTRAST_BlendSubtract(int enable)
1114 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1115 command->enable = enable;
// DepthMask command record: a single enable flag.
1118 DEFCOMMAND(10, DepthMask, int enable;)
// Per-thread interpreter: stores the flag; no validate bit is raised in the visible lines.
1119 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1121 thread->depthmask = command->enable;
// Public API: enqueue a DepthMask command carrying `enable`.
1123 void DPSOFTRAST_DepthMask(int enable)
1125 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1126 command->enable = enable;
// DepthFunc command record: the comparison function selector.
1129 DEFCOMMAND(11, DepthFunc, int func;)
// Per-thread interpreter: stores the selector in thread->depthfunc.
// NOTE(review): unlike DepthTest, no DPSOFTRAST_VALIDATE_DEPTHFUNC bit is raised in the
// visible lines -- confirm whether that is on an elided line or intentionally absent.
1130 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1132 thread->depthfunc = command->func;
// Public API: enqueue a DepthFunc command carrying `func`.
1134 void DPSOFTRAST_DepthFunc(int func)
1136 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1137 command->func = func;
// DepthRange command record: near and far depth values.
1140 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Per-thread interpreter: stores near in thread->depthrange[0] and far in [1].
1141 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1143 thread->depthrange[0] = command->nearval;
1144 thread->depthrange[1] = command->farval;
// Public API: enqueue a DepthRange command carrying the near and far values.
1146 void DPSOFTRAST_DepthRange(float nearval, float farval)
1148 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1149 command->nearval = nearval;
1150 command->farval = farval;
// PolygonOffset command record: two float offsets (alongnormal, intoview).
1153 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Per-thread interpreter: stores alongnormal in thread->polygonoffset[0] and intoview in [1].
1154 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1156 thread->polygonoffset[0] = command->alongnormal;
1157 thread->polygonoffset[1] = command->intoview;
// Public API: enqueue a PolygonOffset command carrying both offsets.
1159 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1161 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1162 command->alongnormal = alongnormal;
1163 command->intoview = intoview;
// CullFace command record: the culling mode selector.
1166 DEFCOMMAND(14, CullFace, int mode;)
// Per-thread interpreter: stores the mode in thread->cullface.
1167 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1169 thread->cullface = command->mode;
// Public API: enqueue a CullFace command carrying `mode`.
1171 void DPSOFTRAST_CullFace(int mode)
1173 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1174 command->mode = mode;
// Public API: set the constant colour stored in dpsoftrast.color[0..3] (r, g, b, a).
// The visible lines write the global state directly; no command is allocated here.
// DPSOFTRAST_Array_Load passes dpsoftrast.color to DPSOFTRAST_Fill4f for the colour array.
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1179 dpsoftrast.color[0] = r;
1180 dpsoftrast.color[1] = g;
1181 dpsoftrast.color[2] = b;
1182 dpsoftrast.color[3] = a;
// Public API: read back a rectangle of the colour framebuffer (fb_colorpixels[0]) into outpixels.
//   blockx, blocky         - origin of the requested rectangle
//   blockwidth, blockheight - size of the requested rectangle
//   outpixels              - destination, addressed with a row pitch of blockwidth*4 bytes
// NOTE(review): the declarations of bx1/by1 and the innermost copy statements are not visible
// in this listing.
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// 4 bytes per pixel on both sides; the framebuffer pitch is fb_width*4
1187 int outstride = blockwidth * 4;
1188 int instride = dpsoftrast.fb_width * 4;
1191 int bx2 = blockx + blockwidth;
1192 int by2 = blocky + blockheight;
1196 unsigned char *inpixels;
// clip the requested rectangle to the framebuffer bounds
1200 if (bx1 < 0) bx1 = 0;
1201 if (by1 < 0) by1 = 0;
1202 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1205 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts take a per-pixel loop (its body is not visible here)
1206 if (dpsoftrast.bigendian)
1208 for (y = by1;y < by2;y++)
// the source row is fb_height - 1 - y, i.e. rows are read in the opposite vertical order
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
// NOTE(review): the output offset is relative to the clipped by1 and has no x term, so a
// rectangle with a negative origin would land shifted in outpixels -- confirm callers never
// pass one.
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
1212 for (x = bx1;x < bx2;x++)
// other hosts: same row addressing, one step per row (the copy statement is not visible here)
1225 for (y = by1;y < by2;y++)
1227 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Public API: copy a width x height rectangle from the colour framebuffer at (sx, sy) into mip
// level `mip` of texture `index` at (tx, ty), then rebuild the mip chain if the texture has one.
// Returns silently when the texture handle or mip level is invalid.
// NOTE(review): the declarations of tx1/ty1/sx1/sy1, the sw/sh/tw/th computations and the
// early return for an empty rectangle are not visible in this listing.
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1238 int tx2 = tx + width;
1239 int ty2 = ty + height;
1242 int sx2 = sx + width;
1243 int sy2 = sy + height;
1253 unsigned int *spixels;
1254 unsigned int *tpixels;
1255 DPSOFTRAST_Texture *texture;
1256 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the framebuffer; target is the selected mip level inside texture->bytes
1259 spixels = dpsoftrast.fb_colorpixels[0];
1260 swidth = dpsoftrast.fb_width;
1261 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset, [2] as the level width and [3] as its height
1262 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263 twidth = texture->mipmap[mip][2];
1264 theight = texture->mipmap[mip][3];
// clip the target rectangle to the mip level and the source rectangle to the framebuffer
1265 if (tx1 < 0) tx1 = 0;
1266 if (ty1 < 0) ty1 = 0;
1267 if (tx2 > twidth) tx2 = twidth;
1268 if (ty2 > theight) ty2 = theight;
1269 if (sx1 < 0) sx1 = 0;
1270 if (sy1 < 0) sy1 = 0;
1271 if (sx2 > swidth) sx2 = swidth;
1272 if (sy2 > sheight) sy2 = sheight;
// copy only the area that fits in both rectangles
1277 if (tw > sw) tw = sw;
1278 if (th > sh) th = sh;
1279 if (tw < 1 || th < 1)
// re-express both starting rows as measured from the opposite vertical edge
1281 sy1 = sheight - sy1 - th;
1282 ty1 = theight - ty1 - th;
// row-by-row copy of tw 4-byte pixels
1283 for (y = 0;y < th;y++)
1284 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1285 if (texture->mipmaps > 1)
1286 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command record: texture unit number and the texture to bind (may be NULL).
1289 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Per-thread interpreter: releases this thread's reference on the texture previously bound to
// the unit (atomic decrement of its `binds` counter), then records the new binding.
1290 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1292 if (thread->texbound[command->unitnum])
1293 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1294 thread->texbound[command->unitnum] = command->texture;
// Public API: bind texture handle `index` to texture unit `unitnum`.
// Sets dpsoftrast.errorstring when the unit number is out of range or a non-zero handle does
// not resolve to a texture.
// NOTE(review): the early returns after the error strings and the guard in front of
// ATOMIC_ADD are not visible in this listing -- presumably `return;` and `if (texture)`.
1296 void DPSOFTRAST_SetTexture(int unitnum, int index)
1298 DPSOFTRAST_Command_SetTexture *command;
1299 DPSOFTRAST_Texture *texture;
1300 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1302 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1305 texture = DPSOFTRAST_Texture_GetByIndex(index);
// a zero index with no texture is accepted (unbind); only a non-zero unknown handle is an error
1306 if (index && !texture)
1308 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1312 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1313 command->unitnum = unitnum;
1314 command->texture = texture;
// mirror the binding in the global state
1316 dpsoftrast.texbound[unitnum] = texture;
// add one bind per thread; each thread's interpreter decrements once when it replaces the binding
1318 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Public API: record the position array pointer and its byte stride in the global state.
// The data is only read later (DPSOFTRAST_Array_Load), so the pointer must stay valid until then.
1321 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1323 dpsoftrast.pointer_vertex3f = vertex3f;
1324 dpsoftrast.stride_vertex = stride;
// Public API: select a float4 colour array and clear the ubyte4 colour pointer, so the two
// colour sources are mutually exclusive. `stride` is the byte distance between elements.
1326 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1328 dpsoftrast.pointer_color4f = color4f;
1329 dpsoftrast.pointer_color4ub = NULL;
1330 dpsoftrast.stride_color = stride;
// Public API: select a ubyte4 colour array and clear the float4 colour pointer, so the two
// colour sources are mutually exclusive. `stride` is the byte distance between elements.
1332 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1334 dpsoftrast.pointer_color4f = NULL;
1335 dpsoftrast.pointer_color4ub = color4ub;
1336 dpsoftrast.stride_color = stride;
// Public API: record the pointer, component count and byte stride of texcoord array `unitnum`.
// NOTE(review): no range check on unitnum is visible before it indexes the three arrays
// (DPSOFTRAST_SetTexture does validate its unit number) -- confirm callers stay in range.
1338 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1340 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1341 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1342 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command record: shader mode, permutation bits and the exact-specular-math flag.
1345 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Per-thread interpreter: copies the three values into the thread's shader state.
1346 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1348 thread->shader_mode = command->mode;
1349 thread->shader_permutation = command->permutation;
1350 thread->shader_exactspecularmath = command->exactspecularmath;
// Public API: enqueue a SetShader command and mirror the same values in the global state.
1352 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1354 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1355 command->mode = mode;
1356 command->permutation = permutation;
1357 command->exactspecularmath = exactspecularmath;
// global copy of the shader selection, alongside the queued per-thread update
1359 dpsoftrast.shader_mode = mode;
1360 dpsoftrast.shader_permutation = permutation;
1361 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command record: uniform slot and its four float values.
1364 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Per-thread interpreter: copies the four floats into thread->uniform4f at offset index*4
// (each uniform slot occupies four consecutive floats).
1365 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1367 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public API: set uniform `index` to (v0, v1, v2, v3). Enqueues a Uniform4f command for the
// threads and mirrors the values in the global dpsoftrast.uniform4f array.
1369 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1371 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1372 command->index = index;
1373 command->val[0] = v0;
1374 command->val[1] = v1;
1375 command->val[2] = v2;
1376 command->val[3] = v3;
// global copy, four floats per slot
1378 dpsoftrast.uniform4f[index*4+0] = v0;
1379 dpsoftrast.uniform4f[index*4+1] = v1;
1380 dpsoftrast.uniform4f[index*4+2] = v2;
1381 dpsoftrast.uniform4f[index*4+3] = v3;
// Public API: array form of DPSOFTRAST_Uniform4f; `v` must point at four floats.
// Enqueues a Uniform4f command and mirrors the values in the global dpsoftrast.uniform4f array.
1383 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1385 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1386 command->index = index;
1387 memcpy(command->val, v, sizeof(command->val));
1389 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command record: first uniform slot and sixteen floats declared with ALIGN()
// (the payload is written with aligned SSE stores by DPSOFTRAST_UniformMatrix4fv).
1392 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Per-thread interpreter: copies all sixteen floats into thread->uniform4f starting at
// index*4, so one matrix fills four consecutive uniform slots.
1393 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1395 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public API: upload `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
//   v         - arraysize * 16 floats
//   transpose - NOTE(review): presumably gates the transpose step below; the `if` itself is
//               not visible in this listing -- confirm.
// One UniformMatrix4f command is queued per matrix and the result is mirrored in the global
// dpsoftrast.uniform4f array.
1397 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix advances the slot index by 4 and the source pointer by 16 floats
1401 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1403 __m128 m0, m1, m2, m3;
1404 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1405 command->index = (DPSOFTRAST_UNIFORM)index;
// source not a multiple of ALIGN_SIZE: use unaligned loads
1406 if (((size_t)v)&(ALIGN_SIZE-1))
1408 m0 = _mm_loadu_ps(v);
1409 m1 = _mm_loadu_ps(v+4);
1410 m2 = _mm_loadu_ps(v+8);
1411 m3 = _mm_loadu_ps(v+12);
// aligned source: use aligned loads
1415 m0 = _mm_load_ps(v);
1416 m1 = _mm_load_ps(v+4);
1417 m2 = _mm_load_ps(v+8);
1418 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine halves so that afterwards
// m0 = (m0[0], m1[0], m2[0], m3[0]), m1 = (m0[1], m1[1], m2[1], m3[1]), and so on
1422 __m128 t0, t1, t2, t3;
1423 t0 = _mm_unpacklo_ps(m0, m1);
1424 t1 = _mm_unpacklo_ps(m2, m3);
1425 t2 = _mm_unpackhi_ps(m0, m1);
1426 t3 = _mm_unpackhi_ps(m2, m3);
1427 m0 = _mm_movelh_ps(t0, t1);
1428 m1 = _mm_movehl_ps(t1, t0);
1429 m2 = _mm_movelh_ps(t2, t3);
1430 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both the command payload and the global array must be 16-byte aligned
1432 _mm_store_ps(command->val, m0);
1433 _mm_store_ps(command->val+4, m1);
1434 _mm_store_ps(command->val+8, m2);
1435 _mm_store_ps(command->val+12, m3);
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1437 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1438 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1439 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command record: uniform slot and its integer value.
1444 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Per-thread interpreter: stores the value in thread->uniform1i[index].
1445 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1447 thread->uniform1i[command->index] = command->val;
// Public API: set integer uniform `index` to i0. Enqueues a Uniform1i command and mirrors the
// value in the global dpsoftrast.uniform1i array.
// NOTE(review): the assignment of i0 to command->val is not visible in this listing; the
// interpreter reads command->val, so presumably it is set on an elided line -- confirm.
1449 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1451 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1452 command->index = index;
1455 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command record: the four plane coefficients.
1458 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Per-thread interpreter: copies the four coefficients into thread->clipplane and marks the
// framebuffer state for revalidation.
1459 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1461 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1462 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: enqueue a ClipPlane command with coefficients (x, y, z, w).
1464 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1466 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1467 command->clipplane[0] = x;
1468 command->clipplane[1] = y;
1469 command->clipplane[2] = z;
1470 command->clipplane[3] = w;
// Copy `size` 4-float vectors from a strided source into a contiguous output.
//   dst    - output, size*4 floats, written with _mm_store_ps (must be 16-byte aligned)
//   src    - first source element
//   stride - byte distance between source elements
// NOTE(review): the loop headers and pointer increments are not visible in this listing.
1474 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1476 float *end = dst + size*4;
// aligned loads are only used when both the start address and the stride are multiples of ALIGN_SIZE
1477 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1481 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1490 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expand `size` 3-float vectors into 4-float vectors with the fourth component set to 1.0f.
//   dst    - output, size*4 floats, written with _mm_store_ps (must be 16-byte aligned)
//   src    - first source element
//   stride - byte distance between source elements
// Lane trick used throughout: shuffle (2,1,0,3) rotates (x,y,z,?) to (?,x,y,z), _mm_move_ss
// overwrites lane 0 with 1.0f, and shuffle (0,3,2,1) rotates the result back to (x,y,z,1).
// NOTE(review): the loop headers, dst increments and else branches are not visible in this listing.
1497 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1499 float *end = dst + size*4;
// tightly packed input: four vertices (12 floats) are consumed per step as three vectors
1500 if (stride == sizeof(float[3]))
// end4 bounds the part of the output that is a whole multiple of four vertices
1502 float *end4 = dst + (size&~3)*4;
1503 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1507 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// vertex 0
1508 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1509 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 1: gathers (x1, x1, y1, z1) from v1/v2, then lane 0 becomes 1.0f
1511 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1512 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1513 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 2: gathers (x2, y2, z2, z2) from v2/v3, rotates, then lane 0 becomes 1.0f
1514 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1515 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1516 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1517 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 3: v3 already holds (z2, x3, y3, z3), so only lane 0 needs replacing
1518 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1519 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1521 src += 4*sizeof(float[3]);
// aligned source: same sequence with aligned loads
1528 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1529 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1530 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1533 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1534 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1535 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1536 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1537 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1538 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1539 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1540 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1542 src += 4*sizeof(float[3]);
// one element at a time; aligned loads only when address and stride are multiples of ALIGN_SIZE
1546 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
// NOTE(review): this loads four floats although only three are used, so for the last element
// it reads 4 bytes past the vertex data -- confirm the callers' buffers tolerate that.
1550 __m128 v = _mm_loadu_ps((const float *)src);
1551 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1552 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1553 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1554 _mm_store_ps(dst, v);
1563 __m128 v = _mm_load_ps((const float *)src);
1564 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1565 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1566 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1567 _mm_store_ps(dst, v);
// Expand `size` 2-float vectors into 4-float vectors (s, t, 0, 1).
//   dst    - output, size*4 floats, written with _mm_store_ps (must be 16-byte aligned)
//   src    - first source element
//   stride - byte distance between source elements
// NOTE(review): the loop headers, dst increments and else branch are not visible in this listing.
1574 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1576 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output vector
1577 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// tightly packed input: two elements (4 floats) are consumed per step
1578 if (stride == sizeof(float[2]))
// end2 bounds the part of the output that is a whole multiple of two elements
1580 float *end2 = dst + (size&~1)*4;
1581 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1585 __m128 v = _mm_loadu_ps((const float *)src);
// first element: low half of v + high half of v2 -> (s0, t0, 0, 1)
1586 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// second element: high half of v moved low, high half of v2 kept -> (s1, t1, 0, 1)
1587 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1589 src += 2*sizeof(float[2]);
1596 __m128 v = _mm_load_ps((const float *)src);
1597 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1598 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1600 src += 2*sizeof(float[2]);
// one element at a time: load two floats into the low half of v2, keeping (0, 1) in the high half
1606 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1612 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1614 float *end = dst + size*4;
1615 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1616 if (stride == sizeof(unsigned char[4]))
1618 float *end4 = dst + (size&~3)*4;
1619 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1623 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1624 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1626 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1627 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1629 src += 4*sizeof(unsigned char[4]);
1636 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1637 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1638 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1639 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1640 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1642 src += 4*sizeof(unsigned char[4]);
1648 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1649 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fill an output array of `size` 4-float vectors with one constant vector.
//   dst - output, size*4 floats, written with _mm_store_ps (must be 16-byte aligned)
//   src - the four floats to replicate; loaded once with an unaligned load
// NOTE(review): the loop header and dst increment are not visible in this listing.
1655 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1657 float *end = dst + 4*size;
1658 __m128 v = _mm_loadu_ps(src);
1661 _mm_store_ps(dst, v);
// Multiply `numitems` 4-float vectors by a 4x4 matrix given as sixteen floats.
//   out4f       - output vectors (may be the same pointer as in4f)
//   in4f        - input vectors
//   inmatrix16f - matrix; loaded as four groups of four floats with unaligned loads
// Each result is x*m0 + y*m1 + z*m2 + w*m3 where m0..m3 are those four groups.
// NOTE(review): the store of the result, the loop headers, the pointer increments and the
// return after the identity fast path are not visible in this listing.
1667 static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1670 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1671 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1673 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1675 // fast case for identity matrix
1676 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1679 end = out4f + numitems*4;
1680 m0 = _mm_loadu_ps(inmatrix16f);
1681 m1 = _mm_loadu_ps(inmatrix16f + 4);
1682 m2 = _mm_loadu_ps(inmatrix16f + 8);
1683 m3 = _mm_loadu_ps(inmatrix16f + 12);
// input not a multiple of ALIGN_SIZE: unaligned loads; otherwise the same math with aligned loads
1684 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1688 __m128 v = _mm_loadu_ps(in4f);
// broadcast each component of v and accumulate it against the matching matrix group
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1691 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1692 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1693 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1702 __m128 v = _mm_load_ps(in4f);
1704 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1705 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1707 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copy `numitems` 4-float vectors from in4f to out4f.
// Uses memcpy, so the two ranges must not overlap.
1716 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1718 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divide and viewport-map one clip-space vertex (x, y, z, w).
// The vertex is rotated to (1, x, y, z), multiplied by viewportscale, divided by w, offset by
// viewportcenter, and rotated back, giving
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN/sN are elements N of viewportcenter/viewportscale. Element 0 of the viewport vectors
// therefore applies to the 1/w term and elements 1..3 to x, y and z.
// `in` is evaluated once; viewportcenter and viewportscale are used as-is.
1723 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1725 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1726 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1727 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1728 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same projection as DPSOFTRAST_PROJECTVERTEX: the visible lines of the two expansions are
// identical (rotate to (1, x, y, z), scale, divide by w, add the centre, rotate back).
// NOTE(review): no use of this macro is visible in this part of the file -- confirm whether it
// is still needed.
1731 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1733 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1734 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1735 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1736 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Transform one 4-float vertex by a matrix supplied as four __m128 groups:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// NOTE(review): the declaration of the local `p` is not visible in this listing -- presumably
// it is initialised from `in` on an elided line.
1739 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1742 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1743 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1744 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1745 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Compute the screen-row range covered by an axis-aligned bounding box after transformation.
//   starty, endy     - receive the first row and one past the last row
//   minposf, maxposf - box corners, four floats each, loaded with aligned loads
//   inmatrix16f      - matrix applied to the eight box corners
// Corner k takes x from maxpos when bit 0 of k is set, y when bit 1 is set and z/w when bit 2
// is set; otherwise the component comes from minpos. clipmask keeps one bit per corner that is
// behind the z + w >= 0 plane.
// NOTE(review): the BBFRONT(0)/BBFRONT(7) invocations, the #define line of BBCLIP, the `proj`
// declaration in BBFRONT and the return statement are not visible in this listing; presumably
// the function returns clipmask -- confirm.
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// start with all eight corners marked as clipped
1750 int clipmask = 0xFF;
1751 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// min/max start outside the [-2, 2] clamp range applied at the end
1752 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix group so the transformed y lands in lane 0, where the
// scalar (_ss) operations below work
1755 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and when it is >= 0 clear the
// corner's clip bit and fold its y/w into minproj/maxproj
1759 #define BBFRONT(k, pos) \
1761 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1766 clipmask &= ~(1<<k); \
1767 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
1773 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1774 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1775 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1776 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1777 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1778 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k): for a corner that is still clipped, walk the three box edges to its neighbours
// (k^1, k^2, k^4). For every neighbour that is not clipped, interpolate along the edge to the
// point where clipdist reaches zero (frac = d_k / (d_k - d_neighbour)), divide by w and fold
// the resulting y into minproj/maxproj.
1782 if (clipmask&(1<<k)) \
1784 if (!(clipmask&(1<<(k^1)))) \
1786 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789 minproj = _mm_min_ss(minproj, proj); \
1790 maxproj = _mm_max_ss(maxproj, proj); \
1792 if (!(clipmask&(1<<(k^2)))) \
1794 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797 minproj = _mm_min_ss(minproj, proj); \
1798 maxproj = _mm_max_ss(maxproj, proj); \
1800 if (!(clipmask&(1<<(k^4)))) \
1802 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805 minproj = _mm_min_ss(minproj, proj); \
1806 maxproj = _mm_max_ss(maxproj, proj); \
1810 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring element 2 of the viewport vectors (the one DPSOFTRAST_PROJECTVERTEX applies to y)
// into lane 0
1811 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to rows
1813 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj, both truncated toward zero
// (presumably the y viewport scale is negative, so the larger projected y is the smaller row
// -- confirm against the viewport setup)
1817 *starty = _mm_cvttss_si32(maxproj);
1818 *endy = _mm_cvttss_si32(minproj)+1;
// Project `numitems` already-transformed vertices to screen space.
//   out4f          - receives a copy of each input vertex (aligned stores)
//   screen4f       - receives each projected vertex (aligned stores)
//   starty, endy   - receive the covered row range from DPSOFTRAST_Vertex_BoundY
//   in4f           - input vertices, four floats each
// Returns the value of DPSOFTRAST_Vertex_BoundY for the bounding box of the input.
// NOTE(review): the loop headers and pointer increments are not visible in this listing. The
// first vertex is loaded before any loop, so in4f must be readable even when numitems is 0.
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1824 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825 float *end = out4f + numitems*4;
1826 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827 __m128 minpos, maxpos;
// input not a multiple of ALIGN_SIZE: unaligned loads; otherwise the same steps with aligned loads
1828 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
1830 minpos = maxpos = _mm_loadu_ps(in4f);
1833 __m128 v = _mm_loadu_ps(in4f);
1834 minpos = _mm_min_ps(minpos, v);
1835 maxpos = _mm_max_ps(maxpos, v);
1836 _mm_store_ps(out4f, v);
1837 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838 _mm_store_ps(screen4f, v);
1846 minpos = maxpos = _mm_load_ps(in4f);
1849 __m128 v = _mm_load_ps(in4f);
1850 minpos = _mm_min_ps(minpos, v);
1851 maxpos = _mm_max_ps(maxpos, v);
1852 _mm_store_ps(out4f, v);
1853 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854 _mm_store_ps(screen4f, v);
// spill the box to aligned arrays; the vertices are already transformed, so the identity matrix
// is passed for the corner transform
1862 ALIGN(float minposf[4]);
1863 ALIGN(float maxposf[4]);
1864 _mm_store_ps(minposf, minpos);
1865 _mm_store_ps(maxposf, maxpos);
1866 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transform `numitems` vertices by a 4x4 matrix and project them to screen space.
//   out4f          - receives each transformed vertex (aligned stores)
//   screen4f       - receives each projected vertex (aligned stores)
//   starty, endy   - receive the covered row range from DPSOFTRAST_Vertex_BoundY
//   in4f           - input vertices, four floats each
//   inmatrix16f    - matrix as sixteen floats, loaded with unaligned loads
// Returns the value of DPSOFTRAST_Vertex_BoundY (or of DPSOFTRAST_Vertex_Project on the
// identity fast path).
// NOTE(review): the loop headers and pointer increments are not visible in this listing.
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1873 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// a bytewise-identity matrix skips the transform entirely
1876 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878 end = out4f + numitems*4;
1879 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881 m0 = _mm_loadu_ps(inmatrix16f);
1882 m1 = _mm_loadu_ps(inmatrix16f + 4);
1883 m2 = _mm_loadu_ps(inmatrix16f + 8);
1884 m3 = _mm_loadu_ps(inmatrix16f + 12);
// input not a multiple of ALIGN_SIZE: unaligned loads; otherwise the same steps with aligned loads
1885 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1887 minpos = maxpos = _mm_loadu_ps(in4f);
1890 __m128 v = _mm_loadu_ps(in4f);
// the bounding box is accumulated from the untransformed input; the matrix is applied to the
// box corners later inside DPSOFTRAST_Vertex_BoundY
1891 minpos = _mm_min_ps(minpos, v);
1892 maxpos = _mm_max_ps(maxpos, v);
1893 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894 _mm_store_ps(out4f, v);
1895 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896 _mm_store_ps(screen4f, v);
1904 minpos = maxpos = _mm_load_ps(in4f);
1907 __m128 v = _mm_load_ps(in4f);
1908 minpos = _mm_min_ps(minpos, v);
1909 maxpos = _mm_max_ps(maxpos, v);
1910 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911 _mm_store_ps(out4f, v);
1912 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913 _mm_store_ps(screen4f, v);
1921 ALIGN(float minposf[4]);
1922 ALIGN(float maxposf[4]);
1923 _mm_store_ps(minposf, minpos);
1924 _mm_store_ps(maxposf, maxpos);
1925 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Load vertex attribute `inarray` into dpsoftrast.post_array4f[outarray] as 4-float vectors,
// covering dpsoftrast.numvertices elements starting at dpsoftrast.firstvertex.
// NOTE(review): the switch header, the default/texcoord case labels, the component-count case
// labels, the else in the colour case and the return statement are not visible in this listing.
1931 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1934 float *outf = dpsoftrast.post_array4f[outarray];
1935 const unsigned char *inb;
1936 int firstvertex = dpsoftrast.firstvertex;
1937 int numvertices = dpsoftrast.numvertices;
// positions: three floats per vertex, expanded to four
1941 case DPSOFTRAST_ARRAY_POSITION:
1942 stride = dpsoftrast.stride_vertex;
1943 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1944 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colours: the float4 pointer is checked first, then the ubyte4 pointer
1946 case DPSOFTRAST_ARRAY_COLOR:
1947 stride = dpsoftrast.stride_color;
1948 if (dpsoftrast.pointer_color4f)
1950 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1951 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1953 else if (dpsoftrast.pointer_color4ub)
// stride already holds stride_color from above; this reassignment is redundant
1955 stride = dpsoftrast.stride_color;
1956 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1957 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// replicate the constant colour (presumably the fallback when neither pointer is set)
1961 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoord arrays are indexed relative to DPSOFTRAST_ARRAY_TEXCOORD0 and dispatched on their
// component count to the 2-, 3- or 4-float loader
1965 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1966 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1969 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1972 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1975 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1978 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transform an attribute array in place by a 4x4 matrix.
// When inarray >= 0 the attribute is first loaded into post_array4f[outarray]; a negative
// inarray reuses whatever post_array4f[outarray] already holds.
// NOTE(review): the return statement is not visible in this listing -- presumably `data`.
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1992 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Project an attribute array to screen space without transforming it.
// When inarray >= 0 the attribute is first loaded into post_array4f[outarray]; a negative
// inarray reuses the existing contents. Writes dpsoftrast.screencoord4f, drawstarty, drawendy
// and stores the result of DPSOFTRAST_Vertex_Project in dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this listing -- presumably `data`.
1998 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
2001 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2002 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transform an attribute array in place by a 4x4 matrix and project it to screen space.
// When inarray >= 0 the attribute is first loaded into post_array4f[outarray]; a negative
// inarray reuses the existing contents. Writes dpsoftrast.screencoord4f, drawstarty, drawendy
// and stores the result of DPSOFTRAST_Vertex_TransformProject in dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this listing -- presumably `data`.
2010 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2013 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2014 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepare the per-pixel reciprocal of the interpolated w for one span.
// w varies linearly across the span (triangle->w[0] per pixel in x, triangle->w[1] per row,
// triangle->w[2] constant); the value of interest at pixel x is 1 / w(x).
// NOTE(review): the stores into zf and the declaration of z/dz are not visible in this
// listing -- presumably each pixel's value is written to zf[x]; confirm.
2021 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2024 int startx = span->startx;
2025 int endx = span->endx;
2026 float wslope = triangle->w[0];
// w at the span origin; startx/endx are offsets relative to span->x
2027 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028 float endz = 1.0f / (w + wslope * startx);
// no change of w along x: every pixel shares the same reciprocal
2029 if (triangle->w[0] == 0)
2031 // LordHavoc: fast flat polygons (HUD/menu)
2032 for (x = startx;x < endx;x++)
// otherwise work in subspans of DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the exact reciprocal is
// computed only at subspan ends and stepped linearly (dz) in between
2036 for (x = startx;x < endx;)
2038 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last subspan is clamped to end on the final pixel of the span
2040 if (nextsub >= endx) nextsub = endsub = endx-1;
2041 endz = 1.0f / (w + wslope * nextsub);
// guard against a zero-length step when the subspan is a single pixel
2042 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043 for (; x <= endsub; x++, z += dz)
// Commits one span of shaded pixels (in4ub) to the color framebuffer, honoring the span's pixel mask and the thread's blend mode.
//   thread    supplies shader_permutation (alpha-kill test) and fb_blendmode
//   triangle  NOTE(review): not referenced by the statements of this function that are documented here
//   span      supplies startx/endx, the framebuffer x/y origin and the per-pixel mask
//   in4ub     source pixels, 4 bytes per pixel, indexed by x in [startx, endx); byte 3 of each pixel is used as alpha
2048 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2052 int startx = span->startx;
2053 int endx = span->endx;
// the same source pixels viewed as one 32-bit word per pixel
2056 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057 unsigned char * RESTRICT pixelmask = span->pixelmask;
2058 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move to this span's origin in framebuffer 0: fb_width pixels per row, so pixeli[x] is the destination of source pixel x
2061 pixeli += span->y * dpsoftrast.fb_width + span->x;
2062 // handle alphatest now (this affects depth writes too)
2063 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2064 for (x = startx;x < endx;x++)
// alpha below 128 fails the test and the pixel is masked off
2065 if (in4ub[x*4+3] < 128)
2066 pixelmask[x] = false;
2067 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2068 // helps sprites, text and hud artwork
2069 switch(thread->fb_blendmode)
// the alpha-weighted modes: pixels with an alpha byte of 0 are masked off
2071 case DPSOFTRAST_BLENDMODE_ALPHA:
2072 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2073 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2075 for (x = startx;x < endx;x++)
2077 if (in4ub[x*4+3] >= 1)
// skip over a run of pixels with non-zero alpha
2082 while (++x < endx && in4ub[x*4+3] >= 1) ;
2084 if (x >= endx) break;
// mask off a run of pixels with zero alpha
2086 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2087 if (x >= endx) break;
// the remaining modes get no alpha-based masking here
2094 case DPSOFTRAST_BLENDMODE_OPAQUE:
2095 case DPSOFTRAST_BLENDMODE_ADD:
2096 case DPSOFTRAST_BLENDMODE_INVMOD:
2097 case DPSOFTRAST_BLENDMODE_MUL:
2098 case DPSOFTRAST_BLENDMODE_MUL2:
2099 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2100 case DPSOFTRAST_BLENDMODE_INVADD:
2103 // put some special values at the end of the mask to ensure the loops end
// note: this writes pixelmask[endx] and pixelmask[endx+1], so the mask buffer needs room for two entries past the span
2104 pixelmask[endx] = 1;
2105 pixelmask[endx+1] = 0;
2106 // LordHavoc: use a double loop to identify subspans, this helps the
2107 // optimized copy/blend loops to perform at their best, most triangles
2108 // have only one run of pixels, and do the search using wide reads...
2112 // if this pixel is masked off, it's probably not alone...
2119 // the 4-item search must be aligned or else it stalls badly
// step one mask byte at a time until x is a multiple of 4, leaving as soon as an unmasked pixel is found
2120 if ((x & 3) && !pixelmask[x])
2122 if(pixelmask[x]) goto endmasked;
2126 if(pixelmask[x]) goto endmasked;
2130 if(pixelmask[x]) goto endmasked;
// test four mask bytes per read while all four are zero
2135 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// then finish the search one byte at a time
2139 for (;!pixelmask[x];x++)
2141 // rather than continue the loop, just check the end variable
2146 // find length of subspan
2149 if (subx + 8 < endx)
2153 if(!pixelmask[subx]) goto endunmasked;
2157 if(!pixelmask[subx]) goto endunmasked;
2161 if(!pixelmask[subx]) goto endunmasked;
// test four mask bytes per read while all four are exactly 1
2166 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2170 for (;pixelmask[subx];subx++)
2172 // the checks can overshoot, so make sure to clip it...
2176 // now that we know the subspan length... process!
2177 switch(thread->fb_blendmode)
// OPAQUE: plain copy of source pixels [x, subx) into the framebuffer
2179 case DPSOFTRAST_BLENDMODE_OPAQUE:
2183 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// unaligned 128-bit copies, 16 pixels per iteration
2188 while (x + 16 <= subx)
2190 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2191 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2192 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2193 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels per iteration
2198 while (x + 4 <= subx)
2200 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2206 pixeli[x+1] = ini[x+1];
// FINISHBLEND(blend2, blend1) is the shared blend loop: it processes two pixels per iteration with 'blend2', then a
// single leftover pixel with 'blend1'. In both, src and dst hold the source and framebuffer pixels widened to one
// 16-bit lane per byte, and the blended dst is packed back to bytes with unsigned saturation.
// ALPHA: dst += (src - dst) * alpha / 256. Alpha is lane 3 of each source pixel broadcast to that pixel's four lanes;
// both factors are shifted left by 4 so that the high half of the 16-bit product equals (src - dst) * alpha >> 8.
2216 case DPSOFTRAST_BLENDMODE_ALPHA:
2217 #define FINISHBLEND(blend2, blend1) \
2218 for (;x + 1 < subx;x += 2) \
2221 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2222 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2224 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2229 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2230 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2232 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2236 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2239 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += src * alpha >> 8
2243 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2245 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2246 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2248 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2249 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (the pack in FINISHBLEND saturates the result to 255)
2252 case DPSOFTRAST_BLENDMODE_ADD:
2253 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= dst * src >> 8
2255 case DPSOFTRAST_BLENDMODE_INVMOD:
2257 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2259 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = src * dst >> 8
2262 case DPSOFTRAST_BLENDMODE_MUL:
2263 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = src * dst >> 7, i.e. twice the MUL result
2265 case DPSOFTRAST_BLENDMODE_MUL2:
2266 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= src * alpha >> 8
2268 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2270 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2273 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + (dst - (dst * alpha >> 8))
2277 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2279 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2280 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2282 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2283 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src >> 8, using the same shift-by-4 / high-half multiply as ALPHA
2286 case DPSOFTRAST_BLENDMODE_INVADD:
2288 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2290 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2298 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2299 // warning: this is SLOW, only use if the optimized per-span functions won't do
2301 const unsigned char * RESTRICT pixelbase;
2302 const unsigned char * RESTRICT pixel[4];
2303 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2304 int wrapmask[2] = { width-1, height-1 };
2305 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2306 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2308 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2309 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2310 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2311 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2312 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2313 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2314 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2316 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2317 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2318 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2319 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2323 tci[0] &= wrapmask[0];
2324 tci[1] &= wrapmask[1];
2325 tci1[0] &= wrapmask[0];
2326 tci1[1] &= wrapmask[1];
2328 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2329 pixel[1] = pixelbase + 4 * (tci[0] - tci[1]*width);
2330 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2331 pixel[3] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2332 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2333 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2334 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2335 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2339 int tci[2] = { x * width, y * height };
2340 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2342 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2343 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2347 tci[0] &= wrapmask[0];
2348 tci[1] &= wrapmask[1];
2350 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
// Float version of the 2D texture fetch for one span: for each x in [span->startx, span->endx) it stores one RGBA
// color (four floats in 0..1) at out4f[x*4 .. x*4+3], sampled from the texture bound to texunitindex. Texture bytes
// 0 and 2 are swapped on the way out (texture byte 2 goes to out4f[x*4+0], byte 0 to out4f[x*4+2]).
//   thread        supplies the bound textures (thread->texbound)
//   triangle      supplies the mip level per texture unit, and the attribute planes read by DPSOFTRAST_CALCATTRIB4F
//   span          supplies startx/endx (and is handed to DPSOFTRAST_CALCATTRIB4F)
//   out4f         destination, 4 floats per pixel
//   texunitindex  which texture unit to sample
//   arrayindex    which interpolated attribute holds the texture coordinates
//   zf            per-pixel factor the interpolated coordinates are multiplied by (indexed by x)
2359 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2362 int startx = span->startx;
2363 int endx = span->endx;
2368 float tc[2], endtc[2];
2370 unsigned int tci[2];
2371 unsigned int tci1[2];
2372 unsigned int tcimin[2];
2373 unsigned int tcimax[2];
2378 const unsigned char * RESTRICT pixelbase;
2379 const unsigned char * RESTRICT pixel[4];
2380 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2381 // if no texture is bound, just fill it with white
2384 for (x = startx;x < endx;x++)
2386 out4f[x*4+0] = 1.0f;
2387 out4f[x*4+1] = 1.0f;
2388 out4f[x*4+2] = 1.0f;
2389 out4f[x*4+3] = 1.0f;
2393 mip = triangle->mip[texunitindex];
// base pointer for texel addressing; rows are reached with the negative row stride tciwidth set further down
// (mipmap[mip][0] and [1] look like the level's byte offset and byte size, [2] and [3] its width and height - confirm)
2394 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2395 // if this mipmap of the texture is 1 pixel, just fill it with that color
2396 if (texture->mipmap[mip][1] == 4)
// convert the single texel to floats, swapping bytes 0 and 2
2398 c[0] = texture->bytes[2] * (1.0f/255.0f);
2399 c[1] = texture->bytes[1] * (1.0f/255.0f);
2400 c[2] = texture->bytes[0] * (1.0f/255.0f);
2401 c[3] = texture->bytes[3] * (1.0f/255.0f);
2402 for (x = startx;x < endx;x++)
2404 out4f[x*4+0] = c[0];
2405 out4f[x*4+1] = c[1];
2406 out4f[x*4+2] = c[2];
2407 out4f[x*4+3] = c[3];
// non-zero when the texture asks for linear (bilinear) filtering
2411 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
// fetch the attribute's base value and per-x slope into data/slope
2412 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2413 flags = texture->flags;
// scale factors from normalized coordinates to texels: the level's width and height
2414 tcscale[0] = texture->mipmap[mip][2];
2415 tcscale[1] = texture->mipmap[mip][3];
// negative row stride in texels: row tci[1] is found at pixelbase + 4*tci[1]*tciwidth
2416 tciwidth = -texture->mipmap[mip][2];
// upper clamp limits and wrap masks are both size-1
2419 tcimax[0] = texture->mipmap[mip][2]-1;
2420 tcimax[1] = texture->mipmap[mip][3]-1;
2421 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2422 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinates at the first pixel of the span
2423 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2424 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: exact coordinates are computed at the
// subspan ends and stepped linearly in fixed point in between
2430 for (x = startx;x < endx;)
2432 unsigned int subtc[2];
2433 unsigned int substep[2];
// 4096 = one texel with 12 fraction bits; dividing by the subspan length turns a coordinate delta into a per-pixel step
2434 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2435 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2436 if (nextsub >= endx)
// last subspan: end on the final pixel and rescale the step for the shorter length
2438 nextsub = endsub = endx-1;
2439 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
// exact texel-space coordinates at the end of this subspan
2443 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2444 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// per-pixel step and starting coordinate, both with 12 fraction bits
2450 substep[0] = (endtc[0] - tc[0]) * subscale;
2451 substep[1] = (endtc[1] - tc[1]) * subscale;
2452 subtc[0] = tc[0] * (1<<12);
2453 subtc[1] = tc[1] * (1<<12);
2456 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// four-sample (bilinear) lookup with clamped texel indices
2458 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit fractions and the four sample weights, which always sum to 1<<24
2460 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2461 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2462 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2463 tci[0] = subtc[0]>>12;
2464 tci[1] = subtc[1]>>12;
2465 tci1[0] = tci[0] + 1;
2466 tci1[1] = tci[1] + 1;
// clamp both sample positions into [tcimin, tcimax]
2467 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2468 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2469 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2470 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// the four neighbours: (x0,y0), (x1,y0), (x0,y1), (x1,y1)
2471 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2472 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2473 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2474 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// weighted sum, normalized by 255 * (1<<24) = 0xFF000000 so the result lands in 0..1; bytes 0 and 2 are swapped
2475 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2476 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2477 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2478 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2479 out4f[x*4+0] = c[0];
2480 out4f[x*4+1] = c[1];
2481 out4f[x*4+2] = c[2];
2482 out4f[x*4+3] = c[3];
// four-sample (bilinear) lookup with wrapped texel indices
2487 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2489 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2490 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2491 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2492 tci[0] = subtc[0]>>12;
2493 tci[1] = subtc[1]>>12;
2494 tci1[0] = tci[0] + 1;
2495 tci1[1] = tci[1] + 1;
// wrap by masking with size-1 (only a true wrap when the size is a power of two)
2496 tci[0] &= tciwrapmask[0];
2497 tci[1] &= tciwrapmask[1];
2498 tci1[0] &= tciwrapmask[0];
2499 tci1[1] &= tciwrapmask[1];
2500 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2501 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2502 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2503 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2504 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2505 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2506 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2507 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2508 out4f[x*4+0] = c[0];
2509 out4f[x*4+1] = c[1];
2510 out4f[x*4+2] = c[2];
2511 out4f[x*4+3] = c[3];
2515 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// single-sample (nearest) lookup with clamped texel indices
2517 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2519 tci[0] = subtc[0]>>12;
2520 tci[1] = subtc[1]>>12;
2521 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2522 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2523 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2524 c[0] = pixel[0][2] * (1.0f / 255.0f);
2525 c[1] = pixel[0][1] * (1.0f / 255.0f);
2526 c[2] = pixel[0][0] * (1.0f / 255.0f);
2527 c[3] = pixel[0][3] * (1.0f / 255.0f);
2528 out4f[x*4+0] = c[0];
2529 out4f[x*4+1] = c[1];
2530 out4f[x*4+2] = c[2];
2531 out4f[x*4+3] = c[3];
// single-sample (nearest) lookup with wrapped texel indices
2536 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2538 tci[0] = subtc[0]>>12;
2539 tci[1] = subtc[1]>>12;
2540 tci[0] &= tciwrapmask[0];
2541 tci[1] &= tciwrapmask[1];
2542 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2543 c[0] = pixel[0][2] * (1.0f / 255.0f);
2544 c[1] = pixel[0][1] * (1.0f / 255.0f);
2545 c[2] = pixel[0][0] * (1.0f / 255.0f);
2546 c[3] = pixel[0][3] * (1.0f / 255.0f);
2547 out4f[x*4+0] = c[0];
2548 out4f[x*4+1] = c[1];
2549 out4f[x*4+2] = c[2];
2550 out4f[x*4+3] = c[3];
// SSE2 version of the 2D texture fetch for one span: for each x in [span->startx, span->endx) it stores one 4-byte
// pixel at out4ub[x*4], sampled from the texture bound to texunitindex, in the texture's own byte order.
//   thread        supplies the bound textures (thread->texbound)
//   triangle      supplies the mip level per texture unit, and the attribute planes read by DPSOFTRAST_CALCATTRIB
//   span          supplies startx/endx (and is handed to DPSOFTRAST_CALCATTRIB)
//   out4ub        destination, 4 bytes per pixel
//   texunitindex  which texture unit to sample
//   arrayindex    which interpolated attribute holds the texture coordinates
//   zf            per-pixel factor the interpolated coordinates are multiplied by (indexed by x)
2557 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2561 int startx = span->startx;
2562 int endx = span->endx;
2564 __m128 data, slope, tcscale;
2565 __m128i tcsize, tcmask, tcoffset, tcmax;
2567 __m128i subtc, substep, endsubtc;
2570 int affine; // LordHavoc: optimized affine texturing case
// the output viewed as one 32-bit word per pixel
2571 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2572 const unsigned char * RESTRICT pixelbase;
2573 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2574 // if no texture is bound, just fill it with white
2577 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2580 mip = triangle->mip[texunitindex];
// base pointer for texel addressing; rows are reached with a negative byte stride (see tcoffset below)
2581 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2582 // if this mipmap of the texture is 1 pixel, just fill it with that color
2583 if (texture->mipmap[mip][1] == 4)
2585 unsigned int k = *((const unsigned int *)pixelbase);
2586 for (x = startx;x < endx;x++)
// same zf at both ends of the span: the whole span is then handled as a single subspan (see the "|| affine" test below)
2590 affine = zf[startx] == zf[endx-1];
2591 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2592 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2593 flags = texture->flags;
// load mipmap[mip][0..3] and keep elements 2 and 3 twice: lanes become (width, height, width, height)
// (assumes the mipmap entries are 32-bit integers - confirm against the texture struct)
2594 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
// size-1 per lane: wrap mask and upper clamp limit
2595 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2596 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the two coordinate lanes into the upper half and pre-scale by the texture size
2597 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2598 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texel-space coordinates at the first pixel of the span
2599 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// shift back by half a texel
2601 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// convert to fixed point with 16 fraction bits
2602 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane = ((-width) << 18) + 4, i.e. the 16-bit pair (4, -width*4); _mm_madd_epi16 of an (x, y) texel pair
// with this yields the byte offset x*4 - y*width*4 from pixelbase
2603 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
// size-1 packed to 16-bit lanes, for use against 16-bit texel indices
2604 tcmax = _mm_packs_epi32(tcmask, tcmask);
// walk the span in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2605 for (x = startx;x < endx;)
2607 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// 65536 = one texel with 16 fraction bits; dividing by the subspan length gives a per-pixel step
2608 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2609 if (nextsub >= endx || affine)
// final (or only) subspan: end on the last pixel and rescale the step for the actual length
2611 nextsub = endsub = endx-1;
2612 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// exact coordinates at the end of the subspan
2616 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2618 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2619 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2620 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinates of pixel x, high half = coordinates of pixel x+1; the step is doubled to advance two pixels at once
2621 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2622 substep = _mm_slli_epi32(substep, 1);
// integer texel range touched by this subspan (start, and end plus one step)
2625 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
// fast path: every lane satisfies 0 <= tc < size-1, so neither clamping nor wrapping is needed and the neighbouring
// texels can be read with plain adjacent loads
2626 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// byte distance to the next row: the upper 16 bits of a tcoffset lane, -width*4
2628 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
// two pixels per iteration
2629 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2631 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// keep the integer (upper 16-bit) parts of x and y for both pixels, then turn them into byte offsets
2632 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2633 tci = _mm_madd_epi16(tci, tcoffset);
2634 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2635 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; 'stride' selects the neighbouring row
2636 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2637 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2638 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2639 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// 15-bit fractions, so they stay non-negative in the signed 16-bit multiplies below
2640 fracm = _mm_srli_epi16(subtc, 1);
// blend the two rows with the y fraction (first pixel, then second pixel)
2641 pix1 = _mm_add_epi16(pix1,
2642 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2643 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2644 pix3 = _mm_add_epi16(pix3,
2645 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2646 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// regroup into left texels and right texels of both pixels, then blend with the x fraction
2647 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2648 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2649 pix2 = _mm_add_epi16(pix2,
2650 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2651 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
// pack back to bytes and store both pixels
2652 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the same lookup
2656 const unsigned char * RESTRICT ptr1;
2657 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2658 tci = _mm_madd_epi16(tci, tcoffset);
2659 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2660 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2661 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2662 fracm = _mm_srli_epi16(subtc, 1);
2663 pix1 = _mm_add_epi16(pix1,
2664 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2665 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2666 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2667 pix1 = _mm_add_epi16(pix1,
2668 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2670 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2674 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// four-sample lookup with clamping: each of the four texel positions is clamped to [0, size-1] and fetched on its own
2676 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
// first pixel: replicate its (x, y) pair, add (0,0) (1,0) (0,1) (1,1) to form the four sample positions, clamp them
2678 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2679 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2680 tci = _mm_madd_epi16(tci, tcoffset);
2681 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2682 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2683 _mm_setzero_si128());
2684 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2685 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2686 _mm_setzero_si128());
// second pixel of the pair
// NOTE(review): inside this CLAMPTOEDGE branch the second pixel's positions are masked with tcmax (wrap) instead of
// being clamped with min/max like the first pixel above - this looks copied from the wrapping path; confirm and fix
2687 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2688 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2689 tci = _mm_madd_epi16(tci, tcoffset);
2690 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2691 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2692 _mm_setzero_si128());
2693 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2694 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2695 _mm_setzero_si128());
// same two-stage blend as in the fast path
2696 fracm = _mm_srli_epi16(subtc, 1);
2697 pix1 = _mm_add_epi16(pix1,
2698 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2699 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2700 pix3 = _mm_add_epi16(pix3,
2701 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2702 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2703 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2704 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2705 pix2 = _mm_add_epi16(pix2,
2706 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2707 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2708 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the clamped four-sample lookup
2712 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2713 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2714 tci = _mm_madd_epi16(tci, tcoffset);
2715 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2716 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2717 _mm_setzero_si128());
2718 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2719 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2720 _mm_setzero_si128());
2721 fracm = _mm_srli_epi16(subtc, 1);
2722 pix1 = _mm_add_epi16(pix1,
2723 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2724 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2725 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2726 pix1 = _mm_add_epi16(pix1,
2727 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2729 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// four-sample lookup with wrapping: sample positions are masked with size-1 instead of clamped
2735 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2737 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2738 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2739 tci = _mm_madd_epi16(tci, tcoffset);
2740 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2741 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2742 _mm_setzero_si128());
2743 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2744 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2745 _mm_setzero_si128());
2746 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2747 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2748 tci = _mm_madd_epi16(tci, tcoffset);
2749 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2750 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2751 _mm_setzero_si128());
2752 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2753 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2754 _mm_setzero_si128());
2755 fracm = _mm_srli_epi16(subtc, 1);
2756 pix1 = _mm_add_epi16(pix1,
2757 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2758 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2759 pix3 = _mm_add_epi16(pix3,
2760 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2761 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2762 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2763 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2764 pix2 = _mm_add_epi16(pix2,
2765 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2766 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2767 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the wrapped four-sample lookup
2771 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2772 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2773 tci = _mm_madd_epi16(tci, tcoffset);
2774 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2775 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2776 _mm_setzero_si128());
2777 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2778 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2779 _mm_setzero_si128());
2780 fracm = _mm_srli_epi16(subtc, 1);
2781 pix1 = _mm_add_epi16(pix1,
2782 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2783 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2784 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2785 pix1 = _mm_add_epi16(pix1,
2786 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2787 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2788 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2795 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// single-sample lookup with clamping, two pixels per iteration
2797 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2799 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2800 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2801 tci = _mm_madd_epi16(tci, tcoffset);
2802 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2803 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel version
2807 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2808 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2809 tci = _mm_madd_epi16(tci, tcoffset);
2810 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-sample lookup with wrapping, two pixels per iteration
2816 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2818 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2819 tci = _mm_and_si128(tci, tcmax);
2820 tci = _mm_madd_epi16(tci, tcoffset);
2821 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2822 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel version
2826 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2827 tci = _mm_and_si128(tci, tcmax);
2828 tci = _mm_madd_epi16(tci, tcoffset);
2829 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2838 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2841 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadowmap lookup helper: takes a read-only float vector and returns a float.
// NOTE(review): the meaning of 'vector' and of the returned value should be documented here once confirmed against the body and callers.
2844 static float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies every pixel of in4f by an interpolated 4-component attribute and stores the product in out4f,
// for x in [span->startx, span->endx).
//   triangle, span  supply the span range and the attribute planes read by DPSOFTRAST_CALCATTRIB4F
//   out4f           destination, 4 floats per pixel
//   in4f            source, 4 floats per pixel
//   arrayindex      which attribute to interpolate
//   zf              per-pixel factor buffer (presumably the source of 'z' below, as zf[x] - confirm)
2851 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2854 int startx = span->startx;
2855 int endx = span->endx;
// fetch the attribute's base value and per-x slope into data/slope
2860 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2861 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2864 c[0] = (data[0] + slope[0]*x) * z;
2865 c[1] = (data[1] + slope[1]*x) * z;
2866 c[2] = (data[2] + slope[2]*x) * z;
2867 c[3] = (data[3] + slope[3]*x) * z;
// component-wise product with the input pixel
2868 out4f[x*4+0] = in4f[x*4+0] * c[0];
2869 out4f[x*4+1] = in4f[x*4+1] * c[1];
2870 out4f[x*4+2] = in4f[x*4+2] * c[2];
2871 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated 4-component attribute to out4f for every x in [span->startx, span->endx).
//   triangle, span  supply the span range and the attribute planes read by DPSOFTRAST_CALCATTRIB4F
//   out4f           destination, 4 floats per pixel
//   arrayindex      which attribute to interpolate
//   zf              per-pixel factor buffer (presumably the source of 'z' below, as zf[x] - confirm)
2877 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2880 int startx = span->startx;
2881 int endx = span->endx;
// fetch the attribute's base value and per-x slope into data/slope
2886 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2887 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2890 c[0] = (data[0] + slope[0]*x) * z;
2891 c[1] = (data[1] + slope[1]*x) * z;
2892 c[2] = (data[2] + slope[2]*x) * z;
2893 c[3] = (data[3] + slope[3]*x) * z;
2894 out4f[x*4+0] = c[0];
2895 out4f[x*4+1] = c[1];
2896 out4f[x*4+2] = c[2];
2897 out4f[x*4+3] = c[3];
2903 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2905 int x, startx = span->startx, endx = span->endx;
2906 float c[4], localcolor[4];
2907 localcolor[0] = subcolor[0];
2908 localcolor[1] = subcolor[1];
2909 localcolor[2] = subcolor[2];
2910 localcolor[3] = subcolor[3];
2911 for (x = startx;x < endx;x++)
2913 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2914 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2915 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2916 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2917 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2918 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2919 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2920 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2926 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2928 int x, startx = span->startx, endx = span->endx;
2929 for (x = startx;x < endx;x++)
2931 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2932 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2933 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2934 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2940 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2942 int x, startx = span->startx, endx = span->endx;
2943 for (x = startx;x < endx;x++)
2945 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2946 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2947 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2948 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float RGBA spans over [span->startx, span->endx):
// every output component is ina4f * a + inb4f * b, with a = 1 - (4th component of inb4f).
// NOTE(review): `a` and `b` are declared, and `b` is assigned, on lines not visible in this excerpt;
// b is presumably inb4f[x*4+3], which would make this an alpha blend of inb over ina — confirm.
// `triangle` is not referenced in the visible lines.
2954 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2956 int x, startx = span->startx, endx = span->endx;
2958 for (x = startx;x < endx;x++)
// weight for ina: one minus the 4th (alpha) component of the inb pixel
2960 a = 1.0f - inb4f[x*4+3];
2962 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2963 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2964 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2965 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2971 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2973 int x, startx = span->startx, endx = span->endx;
2974 float localcolor[4], ilerp, lerp;
2975 localcolor[0] = color[0];
2976 localcolor[1] = color[1];
2977 localcolor[2] = color[2];
2978 localcolor[3] = color[3];
2979 ilerp = 1.0f - localcolor[3];
2980 lerp = localcolor[3];
2981 for (x = startx;x < endx;x++)
2983 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2984 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2985 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2986 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies an 8-bit, 4-channel input span (in4ub) by an interpolated vertex attribute and stores the
// result in out4ub, for pixels [span->startx, span->endx).
// The attribute is evaluated exactly only at subspan endpoints, as (data + slope*x) * zf[x], and stepped
// between them in fixed point scaled by 256.
// NOTE(review): several short lines of this function (braces, declarations of x/data/slope/mod/endmod,
// and the assignments that seed `mod` and `submod` for each subspan) are not visible in this excerpt.
2993 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2997 int startx = span->startx;
2998 int endx = span->endx;
3001 __m128i submod, substep, endsubmod;
3002 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 of the attribute (presumably RGBA -> BGRA to match the pixel byte order — confirm)
3003 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3004 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, in float and in fixed point (x256)
3005 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3006 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// outer loop: one iteration per subspan of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; x is advanced inside
3007 for (x = startx; x < endx;)
3009 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3010 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last subspan: end on the final pixel and rescale the step to the shorter length
3011 if (nextsub >= endx)
3013 nextsub = endsub = endx-1;
3014 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the end of this subspan; substep is the fixed-point change per pixel across it
3018 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3019 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3020 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit: low half = modulator for pixel x, high half = modulator for pixel x+1
3021 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3022 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): substep holds a single per-pixel step in both halves but is added once per two pixels;
// unless this is intentional the ramp advances at half rate inside a subspan — confirm.
3023 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// unpack with zero in the low byte puts each input byte in the high byte of a 16-bit lane (value << 8),
// so the unsigned high multiply yields (byte * submod) >> 8; packus then clamps to 0..255
3025 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3026 pix = _mm_mulhi_epu16(pix, submod);
3027 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guard and the x increment are on lines not shown)
3031 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3032 pix = _mm_mulhi_epu16(pix, submod);
3033 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute into an 8-bit, 4-channel span (out4ub) for pixels
// [span->startx, span->endx).
// The attribute is evaluated exactly only at subspan endpoints, as (data + slope*x) * zf[x], and stepped
// between them in fixed point scaled by 4095; a right shift by 4 brings it to the 0..255 range.
// NOTE(review): several short lines of this function (braces, declarations of x/data/slope/mod/endmod,
// and the assignments that seed `mod` and `submod` for each subspan) are not visible in this excerpt.
3040 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3044 int startx = span->startx;
3045 int endx = span->endx;
3048 __m128i submod, substep, endsubmod;
3049 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 of the attribute (presumably RGBA -> BGRA to match the pixel byte order — confirm)
3050 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3051 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, in float and in fixed point (x4095)
3052 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3053 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// outer loop: one iteration per subspan of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; x is advanced inside
3054 for (x = startx; x < endx;)
3056 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3057 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last subspan: end on the final pixel and rescale the step to the shorter length
3058 if (nextsub >= endx)
3060 nextsub = endsub = endx-1;
3061 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the end of this subspan; substep is the fixed-point change per pixel across it
3065 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3066 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3067 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit: low half = value for pixel x, high half = value for pixel x+1
3068 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3069 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): substep holds a single per-pixel step in both halves but is added once per two pixels;
// unless this is intentional the ramp advances at half rate inside a subspan — confirm.
3070 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then packus clamps each channel to 0..255
3072 __m128i pix = _mm_srai_epi16(submod, 4);
3073 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guard and the x increment are on lines not shown)
3077 __m128i pix = _mm_srai_epi16(submod, 4);
3078 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit version of the bloom add: for pixels [span->startx, span->endx), per channel,
//   out4ub = clamp255(ina4ub + max(inb4ub - subcolor*255, 0))
// `triangle` is not referenced in the visible lines.
// NOTE(review): braces and the guard in front of the single-pixel tail are on lines not visible here.
3085 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3088 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255 integers with lanes 0 and 2 swapped (presumably RGBA -> BGRA — confirm),
// then packed to 16-bit and duplicated so it covers two pixels
3089 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3090 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels (8 bytes) per iteration
3091 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes
3093 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3094 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract floors the difference at 0; packus clamps the sum to 255
3095 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3096 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one remaining pixel (4 bytes)
3100 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3101 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3102 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3103 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit component-wise multiply of two spans for pixels [span->startx, span->endx):
//   out4ub = (ina4ub * inb4ub) >> 8
// (so 255 * 255 yields 254, not 255). `triangle` is not referenced in the visible lines.
// NOTE(review): braces and the guard in front of the single-pixel tail are on lines not visible here.
3108 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3111 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per iteration
3112 for (x = startx;x+2 <= endx;x+=2)
// pix1: bytes widened to 16-bit (value); pix2: bytes placed in the high byte of each lane (value << 8)
3114 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3115 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of a * (b << 8) == (a * b) >> 8
3116 pix1 = _mm_mulhi_epu16(pix1, pix2);
3117 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one remaining pixel (4 bytes)
3121 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3122 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3123 pix1 = _mm_mulhi_epu16(pix1, pix2);
3124 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit component-wise add of two spans for pixels [span->startx, span->endx):
//   out4ub = clamp255(ina4ub + inb4ub)
// `triangle` is not referenced in the visible lines.
// NOTE(review): braces and the guard in front of the single-pixel tail are on lines not visible here.
3129 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3132 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per iteration
3133 for (x = startx;x+2 <= endx;x+=2)
// widen to 16-bit so the sum cannot wrap, then packus clamps to 0..255
3135 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3136 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3137 pix1 = _mm_add_epi16(pix1, pix2);
3138 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one remaining pixel (4 bytes)
3142 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3143 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3144 pix1 = _mm_add_epi16(pix1, pix2);
3145 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit add of a tinted second span onto the first for pixels [span->startx, span->endx):
//   out4ub = clamp255(ina4ub + inb4ub * inbtintbgra)
// The tint is applied in fixed point (x256). Unlike the other uniform-color helpers here, the tint
// lanes are not swapped, consistent with the parameter name saying it is already in BGRA order.
// `triangle` is not referenced in the visible lines.
// NOTE(review): braces and the guard in front of the single-pixel tail are on lines not visible here.
3151 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3154 int x, startx = span->startx, endx = span->endx;
// tint as 16-bit fixed point (x256), duplicated so it covers two pixels
3155 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3156 tint = _mm_packs_epi32(tint, tint);
// two pixels (8 bytes) per iteration
3157 for (x = startx;x+2 <= endx;x+=2)
// pix1: ina widened to 16-bit; pix2: inb placed in the high byte of each lane (value << 8)
3159 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3160 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of tint * (inb << 8) == (tint * inb) >> 8; packus clamps the sum to 0..255
3161 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3162 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one remaining pixel (4 bytes)
3166 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3167 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3168 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3169 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend of inb over ina for pixels [span->startx, span->endx), weighted by the 4th byte of each
// inb pixel:
//   out4ub = ina4ub + ((inb4ub - ina4ub) * inb_alpha) >> 8
// `triangle` is not referenced in the visible lines.
// NOTE(review): braces and the guard in front of the single-pixel tail are on lines not visible here.
3175 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3178 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per iteration
3179 for (x = startx;x+2 <= endx;x+=2)
3181 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3182 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast 16-bit lane 3 (the 4th channel) of each pixel across that pixel's four lanes
3183 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high multiply returns (diff * blend) >> 8
3184 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3185 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one remaining pixel (only the low four lanes matter)
3189 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3190 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3191 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3192 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3193 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend of a span toward a constant color for pixels [span->startx, span->endx), weighted by the
// color's own 4th component:
//   out4ub = in4ub + ((color255 - in4ub) * alpha255) >> 8
// `triangle` is not referenced in the visible lines.
// NOTE(review): braces and the guard in front of the single-pixel tail are on lines not visible here.
3198 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3201 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255 integers with lanes 0 and 2 swapped (presumably RGBA -> BGRA — confirm),
// then packed to 16-bit and duplicated so it covers two pixels
3202 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3203 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend weight: the color's 4th lane broadcast to every lane, pre-shifted left by 4 for the high multiply
3204 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels (8 bytes) per iteration
3205 for (x = startx;x+2 <= endx;x+=2)
3207 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (diff << 4) * (alpha << 4) >> 16 == (diff * alpha) >> 8, signed
3208 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3209 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same computation for one remaining pixel (4 bytes)
3213 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3214 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3215 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3222 static void DPSOFTRAST_VertexShader_Generic(void)
3224 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3225 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3226 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3227 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3228 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3231 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3233 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3234 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3235 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3236 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3238 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3240 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3241 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3242 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3244 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3245 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3248 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3250 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3253 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3255 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3258 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3263 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3264 if(thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
3267 for (x = span->startx;x < span->endx;x++)
3268 buffer_FragColorbgra8[x*4+3] = buffer_FragColorbgra8[x*4+3] * thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3270 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3275 static void DPSOFTRAST_VertexShader_PostProcess(void)
3277 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3278 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3279 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3282 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3284 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3285 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3286 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3287 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3288 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3289 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3290 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3292 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3293 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3295 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3296 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3298 // TODO: implement saturation
3300 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3302 // TODO: implement gammaramps
3304 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3309 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3311 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3314 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3316 // this is never called (because colormask is off when this shader is used)
3317 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3318 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3319 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3320 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3321 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3326 static void DPSOFTRAST_VertexShader_FlatColor(void)
3328 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3329 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the flat-color shader mode: output = color texture * Color_Ambient uniform, with the
// alpha channel scaled by the Alpha uniform instead of Color_Ambient's 4th component.
// Pixels are written straight into the framebuffer when the blend mode is opaque and ALPHAKILL is off;
// otherwise they go to a temporary span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, the declarations of color/pix/pix2, and the control flow that separates the
// 4-pixel path from the 1-pixel path (skip-ahead / mask test) are on lines not visible in this excerpt.
3332 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3335 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row in color buffer 0, offset so that pixel[x*4] addresses span pixel x
3336 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3337 int x, startx = span->startx, endx = span->endx;
3338 __m128i Color_Ambientm;
3339 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3340 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3341 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3342 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3343 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha kill or blending needs the finish pass, so render into the temporary buffer instead
3344 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3345 pixel = buffer_FragColorbgra8;
// ambient color as fixed point (x256) with lanes 0 and 2 swapped (presumably RGBA -> BGRA — confirm);
// the 4th lane is cleared and replaced by Alpha*255, then everything is packed to 16-bit for two pixels
3346 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3347 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3348 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3349 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3350 for (x = startx;x < endx;x++)
// fast path: at least 4 pixels remain and their 4 mask bytes are all 1 -> process 16 bytes at once
3353 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3356 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes go into the high byte of each 16-bit lane, so mulhi gives (scale * byte) >> 8
3357 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3358 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3359 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same computation on 4 bytes
3365 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3366 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3367 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish pass
3369 if (pixel == buffer_FragColorbgra8)
3370 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3376 static void DPSOFTRAST_VertexShader_VertexColor(void)
3378 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3379 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3380 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the vertex-color shader mode:
//   output = color texture * (Color_Ambient + Color_Diffuse * interpolated vertex color)
// with the alpha scale taken from the Alpha uniform (the diffuse term's 4th lane is zeroed).
// Pixels are written straight into the framebuffer when the blend mode is opaque and ALPHAKILL is off;
// otherwise they go to a temporary span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, the declarations of data/slope/pix2/mod2, and the control flow that separates
// the 4-pixel path from the 1-pixel path are on lines not visible in this excerpt.
3383 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3386 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row in color buffer 0, offset so that pixel[x*4] addresses span pixel x
3387 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3388 int x, startx = span->startx, endx = span->endx;
3389 __m128i Color_Ambientm, Color_Diffusem;
3391 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3392 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3393 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3394 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3395 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3396 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha kill or blending needs the finish pass, so render into the temporary buffer instead
3397 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3398 pixel = buffer_FragColorbgra8;
// ambient: fixed point x256, lanes 0/2 swapped (presumably RGBA -> BGRA — confirm), 4th lane = Alpha*255
3399 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3400 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3401 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3402 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: fixed point x4096, lanes 0/2 swapped, 4th lane cleared
3403 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3404 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3405 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color attribute: lanes 0/2 swapped, advanced to startx, and pre-scaled by 4096 so that
// (diffuse x4096) * (color x4096) >> 16 lands on the same x256 scale as the ambient term
3406 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3407 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3408 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3409 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3410 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3411 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// `data` is stepped by `slope` once per loop iteration; the 4-pixel path adds three more steps itself
3412 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3414 __m128i color, mod, pix;
// fast path: at least 4 pixels remain and their 4 mask bytes are all 1 -> process 16 bytes at once
3415 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3418 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3419 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// per-pixel vertex color = data * z for each of the four pixels; pixels 0/1 go to mod, 2/3 to mod2
3420 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3421 data = _mm_add_ps(data, slope);
3422 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3423 data = _mm_add_ps(data, slope);
3424 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3425 data = _mm_add_ps(data, slope);
3426 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texture, texture bytes in the high byte of each lane
3427 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3428 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3429 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3430 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3431 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same computation on 4 bytes
3437 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3438 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3439 mod = _mm_packs_epi32(mod, mod);
3440 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3441 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish pass
3443 if (pixel == buffer_FragColorbgra8)
3444 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3450 static void DPSOFTRAST_VertexShader_Lightmap(void)
3452 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3453 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3454 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for the lightmap shader mode:
//   output = color texture * (Color_Ambient + Color_Diffuse * lightmap texture)
// and, with the GLOW permutation, additionally  + Color_Glow * glow texture.
// The alpha scale comes from the Alpha uniform; the diffuse and glow terms have their 4th lane zeroed.
// Pixels are written straight into the framebuffer when the blend mode is opaque and ALPHAKILL is off;
// otherwise they go to a temporary span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, the declaration of pix2, the `else` that presumably separates the glow loop from
// the non-glow loop, and the control flow between the 4-pixel and 1-pixel paths are on lines not
// visible in this excerpt.
3457 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3460 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row in color buffer 0, offset so that pixel[x*4] addresses span pixel x
3461 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3462 int x, startx = span->startx, endx = span->endx;
3463 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3464 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3465 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3466 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3467 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3468 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3469 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, lightmap texture uses TEXCOORD4
3470 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3471 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha kill or blending needs the finish pass, so render into the temporary buffer instead
3472 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3473 pixel = buffer_FragColorbgra8;
// ambient: fixed point x256, lanes 0/2 swapped (presumably RGBA -> BGRA — confirm), 4th lane = Alpha*255
3474 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3475 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3476 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3477 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: fixed point x256, lanes 0/2 swapped, 4th lane cleared
3478 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3479 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3480 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3481 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture (TEXCOORD0) and prepare the glow color like the diffuse color
3483 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3484 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3485 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3486 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path below
3487 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3488 for (x = startx;x < endx;x++)
3490 __m128i color, lightmap, glow, pix;
// fast path: at least 4 pixels remain and their 4 mask bytes are all 1 -> process 16 bytes at once
3491 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3494 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3495 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3496 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * color + glowcolor * glow, for pixels 0-1 (pix) and 2-3 (pix2)
3497 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3498 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3499 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3500 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3501 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3502 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3503 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the low 64 bits compute (diffuse*lightmap + ambient) * color while the high 64 bits
// compute glowcolor * glow (lightmap's high half is zero); the two halves are then summed
3509 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3510 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3511 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3512 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3513 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3514 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term (same structure as above)
3519 for (x = startx;x < endx;x++)
3521 __m128i color, lightmap, pix;
// fast path: at least 4 pixels remain and their 4 mask bytes are all 1 -> process 16 bytes at once
3522 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3525 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3526 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * color, for pixels 0-1 (pix) and 2-3 (pix2)
3527 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3528 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3529 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3530 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3531 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same computation on 4 bytes
3537 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3538 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3539 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3540 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish pass
3543 if (pixel == buffer_FragColorbgra8)
3544 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations. Both functions are defined later in this file but are
// called by the FakeLight and LightDirectionMap_* wrapper shaders that follow.
3549 void DPSOFTRAST_VertexShader_LightDirection(void);
3550 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex shader entry point.
// Does no work of its own: it only calls DPSOFTRAST_VertexShader_LightDirection().
3552 static void DPSOFTRAST_VertexShader_FakeLight(void)
3554 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel shader entry point.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection(),
// which itself checks thread->shader_mode against SHADERMODE_FAKELIGHT.
3557 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3559 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// LightDirectionMap (model space) vertex shader entry point.
// Runs the common LightDirection vertex shader, then additionally loads the
// TEXCOORD4 array. The LightDirection pixel shader uses TEXCOORD4 as the texture
// coordinate for its lightmap and deluxemap lookups in this shader mode.
3564 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3566 DPSOFTRAST_VertexShader_LightDirection();
3567 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// LightDirectionMap (model space) pixel shader entry point.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection(),
// which itself checks thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3570 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3572 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// LightDirectionMap (tangent space) vertex shader entry point.
// Runs the common LightDirection vertex shader, then additionally loads the
// TEXCOORD4 array. The LightDirection pixel shader uses TEXCOORD4 as the texture
// coordinate for its lightmap and deluxemap lookups in this shader mode.
3577 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3579 DPSOFTRAST_VertexShader_LightDirection();
3580 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// LightDirectionMap (tangent space) pixel shader entry point.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection(),
// which itself checks thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3583 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3585 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the LightDirection shader family.
//
// For every vertex it writes two derived attributes into dpsoftrast.post_array4f:
//   TEXCOORD5 = the LightDir uniform dotted with the TEXCOORD1, TEXCOORD2 and
//               TEXCOORD3 vectors (named svector, tvector and normal here)
//   TEXCOORD6 = (EyePosition uniform - vertex position) dotted with the same
//               three vectors
// Both outputs get w = 0. Afterwards POSITION is passed through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform.
//
// Takes no parameters and returns nothing; all inputs and outputs go through the
// global dpsoftrast state.
3590 void DPSOFTRAST_VertexShader_LightDirection(void)
3593 int numvertices = dpsoftrast.numvertices;
3595 float LightVector[4];
3596 float EyePosition[4];
3597 float EyeVectorModelSpace[4];
// Copy the light direction and eye position uniforms into locals.
// Component [3] of each is copied but not referenced by the code below.
3603 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3604 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3605 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3606 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3607 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3608 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3609 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3610 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Bring the input arrays into post_array4f. TEXCOORD0 is the only one that is
// transformed (by the TexMatrixM1 uniform); the rest are loaded as-is.
3611 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3612 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3613 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3614 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3615 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3616 for (i = 0;i < numvertices;i++)
// Fetch the xyz of this vertex's position and of its three basis vectors.
3618 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3619 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3620 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3621 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3622 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3623 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3624 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3625 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3626 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3627 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3628 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3629 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Project the light direction onto (svector, tvector, normal) and store it as TEXCOORD5.
3630 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3631 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3632 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3633 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3634 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3635 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3636 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye, projected onto the same three vectors
// and stored as TEXCOORD6. It is left unnormalized here.
3637 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3638 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3639 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3640 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3641 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3642 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3643 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3644 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3645 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3646 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Runs after the loop so the lighting math above sees the untransformed positions.
// NOTE(review): the meaning of the -1 second argument is not visible here - confirm
// against DPSOFTRAST_Array_TransformProject.
3648 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small function-like helper macros for the scalar shader code below.
// They expand their arguments more than once, so do not pass expressions
// with side effects (e.g. i++).
// Min/Max: smaller / larger of two values.
3651 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3652 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of two indexable vectors.
3653 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length and length of a 3-vector; Length goes through sqrt().
3654 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3655 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro that starts by computing the vector's length into a float.
// NOTE(review): only the length computation is visible in this listing; the rest
// of the macro body is presumably the in-place scaling of v - confirm.
3656 #define DPSOFTRAST_Vector3Normalize(v)\
3659 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Scalar pixel shader for the LightDirection shader family. The FakeLight and
// LightDirectionMap_* pixel shader wrappers forward here as well, and the body
// branches on thread->shader_mode to tell those modes apart.
//
// Parameters:
//   thread   - per-thread state; supplies uniform4f, shader_permutation,
//              shader_mode and shader_exactspecularmath
//   triangle - triangle being drawn; passed to the span helpers and attribute setup
//   span     - span being drawn; pixels x in [span->startx, span->endx) are shaded
//
// The function samples the needed textures into per-span BGRA8 scratch buffers,
// computes one output colour per pixel into buffer_FragColorbgra8, and hands that
// buffer to DPSOFTRAST_Draw_Span_FinishBGRA8. There are three per-pixel loops:
// one under SHADERPERMUTATION_SPECULAR, one under SHADERPERMUTATION_DIFFUSE, and a
// final ambient(+glow)-only loop (presumably the fallback when neither bit is set).
//
// NOTE(review): several locals used below (z, d, f, diffuse, specular, glosstex,
// eyenormal) have no declaration visible in this listing - presumably they are
// declared on lines not shown here.
3670 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch: buffer_z is filled by DPSOFTRAST_Draw_Span_Begin, the byte
// buffers hold 4 bytes per pixel for each sampled texture and for the result.
3672 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3673 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3675 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3678 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3679 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3680 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3681 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3682 int x, startx = span->startx, endx = span->endx;
3683 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each *data / *slope pair is used below as (data[k] + slope[k]*x), i.e. an
// attribute that varies linearly with x along the span.
3684 float LightVectordata[4];
3685 float LightVectorslope[4];
3686 float EyeVectordata[4];
3687 float EyeVectorslope[4];
3688 float VectorSdata[4];
3689 float VectorSslope[4];
3690 float VectorTdata[4];
3691 float VectorTslope[4];
3692 float VectorRdata[4];
3693 float VectorRslope[4];
3695 float diffusetex[4];
3697 float surfacenormal[4];
3698 float lightnormal[4];
3699 float lightnormal_modelspace[4];
3701 float specularnormal[4];
3704 float SpecularPower;
// Colour uniforms are copied with components 0 and 2 swapped (uniform +0 goes to
// index 2, +2 goes to index 0) so that index k lines up with byte k of the
// *bgra8 buffers (presumably RGB uniforms reordered to BGR).
// The fourth component is forced to 0, except Color_Ambient[3], which takes
// component 0 of the Alpha uniform and scales the output alpha.
3706 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3707 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3708 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3709 Color_Glow[3] = 0.0f;
3710 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3711 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3712 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3713 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3714 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3715 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3716 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3717 Color_Pants[3] = 0.0f;
3718 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3719 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3720 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3721 Color_Shirt[3] = 0.0f;
// Span setup, then the texture fetches shared by all three paths. The colour
// texture is always sampled; pants/shirt and glow only when their permutation
// bit is set. All of these use TEXCOORD0.
3722 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3723 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3726 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3727 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3729 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3731 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular permutation (ambient + diffuse + specular, optional glow) ----
3733 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3735 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3736 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3737 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3738 Color_Diffuse[3] = 0.0f;
// Uniform light colour; the LIGHTDIRECTIONMAP and FAKELIGHT modes overwrite
// LightColor[0..2] per pixel inside the loop.
3739 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3740 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3741 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3742 LightColor[3] = 0.0f;
3743 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3744 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3745 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3746 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3747 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the raw gloss alpha byte.
3748 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed by every shader mode in this path.
3749 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3750 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs: model-space light maps need the S/T/R vectors
// (TEXCOORD1..3) plus lightmap and deluxemap texels sampled with TEXCOORD4;
// tangent-space light maps need only the two textures; FAKELIGHT needs nothing
// extra; the remaining mode interpolates the light vector from TEXCOORD5.
3752 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3754 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3755 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3756 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3757 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3758 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3760 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3762 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3763 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3765 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3767 // nothing of this needed
3771 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the specular path.
3774 for (x = startx;x < endx;x++)
// Diffuse texel as floats in byte units (0..255).
3777 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3778 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3779 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3780 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Add tinted pants/shirt layers. Color_Pants[3] and Color_Shirt[3] are 0, so
// the alpha component is left unchanged.
// NOTE(review): colormapping is applied only in this loop; the diffuse-only and
// ambient-only loops further down do not add it - confirm whether intended.
3781 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3783 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3784 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3785 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3786 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3788 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3789 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3790 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3791 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte/128 - 1 maps 0..255 to about [-1, 1).
// Source bytes 2,1,0 become components 0,1,2 (byte order reversed).
3792 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3793 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3794 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3795 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Determine lightnormal (and, for the light-map and FAKELIGHT modes, LightColor)
// according to the shader mode.
3797 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3799 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3800 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3801 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3802 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3804 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3805 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3806 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3807 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3809 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3810 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3811 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3812 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3814 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3815 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3816 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3817 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3819 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3820 DPSOFTRAST_Vector3Normalize(lightnormal);
3822 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Light colour = lightmap byte / (256 * lightnormal z); z is clamped to at least
// 0.25 so the divisor cannot become tiny or negative.
3824 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3825 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3826 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3827 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3830 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// The deluxemap texel is used directly as the light normal (no S/T/R transform
// and no normalization); light colour is the lightmap byte / 256.
3832 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3833 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3834 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3836 float f = 1.0f / 256.0f;
3837 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3838 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3839 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3842 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: the light direction is the normalized eye vector and the light
// colour is 1.0 in every channel.
// NOTE(review): z is assigned on a line not visible in this listing - presumably
// the per-pixel value from buffer_z[x]; confirm.
3844 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3845 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3846 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3847 DPSOFTRAST_Vector3Normalize(lightnormal);
3849 LightColor[0] = 1.0;
3850 LightColor[1] = 1.0;
3851 LightColor[2] = 1.0;
// Remaining mode: light direction interpolated from TEXCOORD5; LightColor keeps
// the uniform value set before the loop.
3855 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3856 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3857 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3858 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L clamped at 0.
3861 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term. With shader_exactspecularmath the light vector is reflected
// about the surface normal and dotted with the normalized eye vector; otherwise
// the normalized sum of light and eye vectors (half vector) is dotted with the
// surface normal. Either result is clamped at 0.
3863 if(thread->shader_exactspecularmath)
3865 // reflect lightnormal at surfacenormal, take the negative of that
3866 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3868 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3869 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3870 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3871 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3873 // dot of this and normalize(EyeVectorFogDepth.xyz)
3874 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3875 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3876 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3877 DPSOFTRAST_Vector3Normalize(eyenormal);
3879 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3883 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3884 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3885 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3886 DPSOFTRAST_Vector3Normalize(eyenormal);
3888 specularnormal[0] = lightnormal[0] + eyenormal[0];
3889 specularnormal[1] = lightnormal[1] + eyenormal[1];
3890 specularnormal[2] = lightnormal[2] + eyenormal[2];
3891 DPSOFTRAST_Vector3Normalize(specularnormal);
3893 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is 1 + SpecularPower * gloss alpha byte (SpecularPower already carries
// the 1/255 factor).
3895 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Combine: [glow +] ambient + (diffuse + specular) * light colour per channel,
// truncate to int and clamp the upper bound to 255 (there is no lower clamp).
// Alpha is diffuse alpha scaled by Color_Ambient[3].
3897 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3899 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3900 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3901 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3902 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3906 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3907 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3908 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3909 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3912 buffer_FragColorbgra8[x*4+0] = d[0];
3913 buffer_FragColorbgra8[x*4+1] = d[1];
3914 buffer_FragColorbgra8[x*4+2] = d[2];
3915 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse permutation (ambient + diffuse, optional glow, no specular) ----
3918 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3920 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3921 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3922 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3923 Color_Diffuse[3] = 0.0f;
3924 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3925 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3926 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3927 LightColor[3] = 0.0f;
3928 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs, as in the specular path, except that the eye vector
// (TEXCOORD6) is set up only for FAKELIGHT, the one mode that uses it here.
3930 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3932 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3933 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3934 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3935 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3936 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3938 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3940 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3941 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3943 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3945 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3949 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the diffuse path.
3952 for (x = startx;x < endx;x++)
3955 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3956 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3957 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3958 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the normal map texel (byte/128 - 1, byte order reversed).
3959 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3960 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3961 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3962 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Light normal / light colour selection; same per-mode logic as the specular path.
3964 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3966 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3967 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3968 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3969 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3971 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3972 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3973 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3974 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3976 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3977 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3978 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3979 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3981 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3982 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3983 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3984 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3986 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3987 DPSOFTRAST_Vector3Normalize(lightnormal);
3989 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap byte / (256 * lightnormal z), with z clamped to at least 0.25.
3991 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3992 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3993 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3994 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3997 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3999 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4000 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4001 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4003 float f = 1.0f / 256.0f;
4004 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4005 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4006 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4009 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4011 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4012 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4013 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4014 DPSOFTRAST_Vector3Normalize(lightnormal);
4016 LightColor[0] = 1.0;
4017 LightColor[1] = 1.0;
4018 LightColor[2] = 1.0;
4022 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4023 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4024 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4025 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term (N.L clamped at 0), then [glow +] diffusetex * (ambient + diffuse * light),
// truncated to int with an upper clamp of 255.
4028 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4029 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4031 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4032 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4033 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4034 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4038 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4039 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4040 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4041 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4043 buffer_FragColorbgra8[x*4+0] = d[0];
4044 buffer_FragColorbgra8[x*4+1] = d[1];
4045 buffer_FragColorbgra8[x*4+2] = d[2];
4046 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (optional glow); no normal map or light direction is used ----
4051 for (x = startx;x < endx;x++)
4054 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4055 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4056 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4057 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4059 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4061 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4062 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4063 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4064 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4068 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4069 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4070 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4071 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4073 buffer_FragColorbgra8[x*4+0] = d[0];
4074 buffer_FragColorbgra8[x*4+1] = d[1];
4075 buffer_FragColorbgra8[x*4+2] = d[2];
4076 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span colours to the span finisher.
4079 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4084 static void DPSOFTRAST_VertexShader_LightSource(void)
4087 int numvertices = dpsoftrast.numvertices;
4088 float LightPosition[4];
4089 float LightVector[4];
4090 float LightVectorModelSpace[4];
4091 float EyePosition[4];
4092 float EyeVectorModelSpace[4];
4098 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4099 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4100 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4101 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4102 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4103 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4104 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4105 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4106 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4107 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4108 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4109 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4110 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4111 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4112 for (i = 0;i < numvertices;i++)
4114 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4115 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4116 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4117 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4118 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4119 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4120 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4121 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4122 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4123 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4124 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4125 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4126 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4127 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4128 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4129 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4130 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4131 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4132 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4133 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4134 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4135 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4136 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4137 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4138 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4139 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4140 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4141 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4142 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4143 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4144 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4145 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4147 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4148 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for the LightSource mode: shades pixels startx..endx-1 of one span into
// buffer_FragColorbgra8 and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - source of uniform4f[], shader_permutation and shader_exactspecularmath
//   triangle - forwarded to DPSOFTRAST_CALCATTRIB4F and the DPSOFTRAST_Draw_Span_* helpers
//   span     - supplies startx/endx; forwarded to the same helpers
// Three per-pixel loops follow: one for SHADERPERMUTATION_SPECULAR, one for
// SHADERPERMUTATION_DIFFUSE, and a last one that uses only the ambient term.
// NOTE(review): the embedded line numbers have gaps, so some lines of this function are not
// shown here; the comments describe only the statements that are visible.
4151 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, indexed by pixel x (four bytes per pixel in the bgra8 ones)
4154 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4155 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4156 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4157 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4159 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4160 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4161 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4162 int x, startx = span->startx, endx = span->endx;
4163 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolants filled by DPSOFTRAST_CALCATTRIB4F below; each is evaluated per pixel as (data + slope*x) * z
4164 float CubeVectordata[4];
4165 float CubeVectorslope[4];
4166 float LightVectordata[4];
4167 float LightVectorslope[4];
4168 float EyeVectordata[4];
4169 float EyeVectorslope[4];
// per-pixel working values
4171 float diffusetex[4];
4173 float surfacenormal[4];
4174 float lightnormal[4];
4176 float specularnormal[4];
4179 float SpecularPower;
4180 float CubeVector[4];
// Load the uniform colors with components 0 and 2 swapped (uniform +0 goes to element 2),
// presumably so the element order matches the bgra8 texel buffers - TODO confirm.
// Color_Glow is loaded but not read again in the lines shown.
4183 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4184 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4185 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4186 Color_Glow[3] = 0.0f;
4187 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4188 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4189 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component comes from the separate Alpha uniform
4190 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4191 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4192 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4193 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4194 Color_Diffuse[3] = 0.0f;
4195 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4196 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4197 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4198 Color_Specular[3] = 0.0f;
4199 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4200 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4201 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4202 Color_Pants[3] = 0.0f;
4203 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4204 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4205 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4206 Color_Shirt[3] = 0.0f;
4207 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4208 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4209 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4210 LightColor[3] = 0.0f;
// pre-divided by 255: it is later multiplied by glosstex[3], which holds a raw texture byte
4211 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// the light, eye and cube vectors are taken from TEXCOORD1, TEXCOORD2 and TEXCOORD3 respectively
4212 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4213 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4214 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4215 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4216 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// the color texture is always sampled; pants/shirt, cube, normal and gloss only for their permutations
4217 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4218 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4220 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4221 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4223 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4224 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- specular permutation: ambient + diffuse + specular, using the normal and gloss textures ----
4225 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4227 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4228 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4229 for (x = startx;x < endx;x++)
// attenuation is 1 minus the squared length of the interpolated cube vector,
// compared against a 0.01 threshold before and after the optional shadowmap factor
4232 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4233 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4234 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4235 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4236 if (attenuation < 0.01f)
4238 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4240 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4241 if (attenuation < 0.01f)
// base color texel as floats in byte units (0..255)
4245 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4246 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4247 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4248 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// colormapping adds the pants and shirt texels tinted by their colors
// (Color_Pants[3] and Color_Shirt[3] were set to 0.0f above, so component 3 gains nothing)
4249 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4251 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4252 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4253 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4254 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4256 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4257 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4258 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4259 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal texel: byte/128 - 1 per component, reading bytes 2,1,0 into elements 0,1,2
4260 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4261 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4262 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4263 DPSOFTRAST_Vector3Normalize(surfacenormal);
4265 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4266 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4267 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4268 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal), negative values replaced by 0
4270 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4272 if(thread->shader_exactspecularmath)
4274 // reflect lightnormal at surfacenormal, take the negative of that
4275 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4277 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4278 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4279 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4280 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4282 // dot of this and normalize(EyeVectorFogDepth.xyz)
4283 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4284 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4285 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4286 DPSOFTRAST_Vector3Normalize(eyenormal);
4288 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// second specular formulation: normalize(lightnormal + eyenormal) dotted with surfacenormal
4292 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4293 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4294 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4295 DPSOFTRAST_Vector3Normalize(eyenormal);
4297 specularnormal[0] = lightnormal[0] + eyenormal[0];
4298 specularnormal[1] = lightnormal[1] + eyenormal[1];
4299 specularnormal[2] = lightnormal[2] + eyenormal[2];
4300 DPSOFTRAST_Vector3Normalize(specularnormal);
4302 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent is 1 + SpecularPower * gloss texel component 3
4304 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// final color per channel: (diffusetex*(ambient + diffuse*N.L) + glosstex*specularcolor*specular) * LightColor * attenuation,
// additionally multiplied by the cube texel when CUBEFILTER is set; only the upper bound (255) is clamped
4306 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4308 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4309 attenuation *= (1.0f / 255.0f);
4310 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4311 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4312 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4313 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4317 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4318 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4319 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4320 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4322 buffer_FragColorbgra8[x*4+0] = d[0];
4323 buffer_FragColorbgra8[x*4+1] = d[1];
4324 buffer_FragColorbgra8[x*4+2] = d[2];
4325 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- diffuse permutation: same steps as above without the gloss texture and specular term ----
4328 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4330 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4331 for (x = startx;x < endx;x++)
// attenuation from the cube vector, as in the specular loop
4334 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4335 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4336 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4337 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4338 if (attenuation < 0.01f)
4340 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4342 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4343 if (attenuation < 0.01f)
// base color texel plus optional pants/shirt contribution
4347 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4348 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4349 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4350 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4351 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4353 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4354 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4355 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4356 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// normal texel decode and diffuse term, as in the specular loop
4358 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4359 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4360 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4361 DPSOFTRAST_Vector3Normalize(surfacenormal);
4363 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4364 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4365 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4366 DPSOFTRAST_Vector3Normalize(lightnormal);
4368 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// final color per channel: diffusetex*(ambient + diffuse*N.L) * LightColor * attenuation [* cube texel]
4369 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4371 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4372 attenuation *= (1.0f / 255.0f);
4373 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4374 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4375 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4376 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4380 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4381 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4382 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4383 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4385 buffer_FragColorbgra8[x*4+0] = d[0];
4386 buffer_FragColorbgra8[x*4+1] = d[1];
4387 buffer_FragColorbgra8[x*4+2] = d[2];
4388 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- remaining loop: ambient term only, no normal texture is read ----
4393 for (x = startx;x < endx;x++)
// attenuation from the cube vector, as in the loops above
4396 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4397 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4398 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4399 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4400 if (attenuation < 0.01f)
4402 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4404 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4405 if (attenuation < 0.01f)
// base color texel plus optional pants/shirt contribution
4409 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4410 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4411 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4412 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4413 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4415 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4416 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4417 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4418 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// final color per channel: diffusetex * ambient * LightColor * attenuation [* cube texel]
4420 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4422 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4423 attenuation *= (1.0f / 255.0f);
4424 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4425 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4426 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4427 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4431 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4432 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4433 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4434 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4436 buffer_FragColorbgra8[x*4+0] = d[0];
4437 buffer_FragColorbgra8[x*4+1] = d[1];
4438 buffer_FragColorbgra8[x*4+2] = d[2];
4439 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finishing routine
4442 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4448 static void DPSOFTRAST_VertexShader_Refraction(void)
4450 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4451 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4452 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4455 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4457 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4459 int x, startx = span->startx, endx = span->endx;
4462 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4463 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4466 float ModelViewProjectionPositiondata[4];
4467 float ModelViewProjectionPositionslope[4];
4470 float ScreenScaleRefractReflect[2];
4471 float ScreenCenterRefractReflect[2];
4472 float DistortScaleRefractReflect[2];
4473 float RefractColor[4];
4475 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4476 if(!texture) return;
4479 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4480 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4483 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4486 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4487 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4488 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4489 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4490 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4491 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4492 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4493 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4494 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4495 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4498 for (x = startx;x < endx;x++)
4500 float SafeScreenTexCoord[2];
4501 float ScreenTexCoord[2];
4508 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4509 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4511 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4512 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4513 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4515 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4516 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4517 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4518 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4519 DPSOFTRAST_Vector3Normalize(v);
4520 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4521 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4523 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4524 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4526 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4527 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4528 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4529 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4532 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4537 static void DPSOFTRAST_VertexShader_Water(void)
4540 int numvertices = dpsoftrast.numvertices;
4541 float EyePosition[4];
4542 float EyeVectorModelSpace[4];
4548 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4549 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4550 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4551 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4552 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4553 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4554 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4555 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4556 for (i = 0;i < numvertices;i++)
4558 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4559 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4560 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4561 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4562 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4563 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4564 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4565 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4566 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4567 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4568 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4569 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4570 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4571 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4572 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4573 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4574 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4575 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4576 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4577 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4578 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4579 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4581 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4582 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4583 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4587 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4589 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4591 int x, startx = span->startx, endx = span->endx;
4594 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4595 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4598 float ModelViewProjectionPositiondata[4];
4599 float ModelViewProjectionPositionslope[4];
4600 float EyeVectordata[4];
4601 float EyeVectorslope[4];
4604 float ScreenScaleRefractReflect[4];
4605 float ScreenCenterRefractReflect[4];
4606 float DistortScaleRefractReflect[4];
4607 float RefractColor[4];
4608 float ReflectColor[4];
4609 float ReflectFactor;
4610 float ReflectOffset;
4612 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4613 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4614 if(!texture_refraction || !texture_reflection) return;
4617 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4618 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4621 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4622 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4625 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4626 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4627 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4628 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4629 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4630 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4631 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4632 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4633 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4634 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4635 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4636 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4637 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4638 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4639 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4640 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4641 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4642 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4643 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4644 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4645 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4646 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4649 for (x = startx;x < endx;x++)
4651 float SafeScreenTexCoord[4];
4652 float ScreenTexCoord[4];
4655 unsigned char c1[4];
4656 unsigned char c2[4];
4661 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4662 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4664 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4665 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4666 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4667 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4668 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4670 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4671 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4672 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4673 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4674 DPSOFTRAST_Vector3Normalize(v);
4675 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4676 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4677 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4678 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4680 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4681 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4682 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4683 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4684 DPSOFTRAST_Vector3Normalize(v);
4685 Fresnel = 1.0f - v[2];
4686 Fresnel = min(1.0f, Fresnel);
4687 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4689 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4690 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4691 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4692 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4694 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4695 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4696 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4697 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4700 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// Calls DPSOFTRAST_Array_TransformProject with the position array given for both
// array arguments, using the uniform4f data at the ModelViewProjectionMatrixM1 slot.
4705 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4707 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the ShowDepth shader mode.
// As visible here it does no shading: it begins the span, zero-fills the BGRA8
// output for the span's [startx, endx) range and hands the buffer to the finish step.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span being shaded; startx/endx select the range that is written
4710 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span
4713 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4714 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4715 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel: clear only the pixels this span covers
4716 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4717 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Same single call as the other minimal vertex stages in this listing: the position
// array is passed as both array arguments together with the ModelViewProjectionMatrixM1 uniforms.
4722 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4724 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the DeferredGeometry shader mode.
// The visible code writes zeroed BGRA8 pixels for the span's [startx, endx) range;
// no deferred-geometry output is computed here.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span being shaded; startx/endx select the range that is written
4727 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span
4730 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4731 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4732 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel: clear only the pixels this span covers
4733 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4734 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Performs one DPSOFTRAST_Array_TransformProject call on the position array
// (same array for both array arguments) with the ModelViewProjectionMatrixM1 uniforms.
4739 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4741 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the DeferredLightSource shader mode.
// The visible code writes zeroed BGRA8 pixels for the span's [startx, endx) range;
// no lighting is evaluated here.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span being shaded; startx/endx select the range that is written
4744 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span
4747 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4748 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4749 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel: clear only the pixels this span covers
4750 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4751 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record: the vertex and span callbacks plus the lists of
// vertex arrays and texture units the mode uses.
// NOTE(review): the table initializers start with an integer and the draw code reads
// a `lodarrayindex` member; that field's declaration is not visible in this listing.
4756 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage entry point (called with no arguments from DPSOFTRAST_DrawTriangles)
4759 void (*Vertex)(void);
// span stage entry point (called per span from DPSOFTRAST_Draw_ProcessSpans)
4760 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// list of array indices; users stop at the first entry >= DPSOFTRAST_ARRAY_TOTAL (the table uses ~0)
4761 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// list of texture unit indices; users stop at the first entry >= DPSOFTRAST_MAXTEXTUREUNITS (the table uses ~0)
4762 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4764 DPSOFTRAST_ShaderModeInfo;
// Dispatch table indexed by shader mode (see the DPSOFTRAST_ShaderModeTable[...shader_mode]
// lookups in the draw code). Each row holds: a leading integer (the draw code compares
// array indices against a `lodarrayindex` member, presumably this value - confirm),
// the vertex callback, the span callback, a ~0-terminated list of vertex arrays and a
// ~0-terminated list of texture units.
4766 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4768 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4769 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4770 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4771 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4772 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4773 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4774 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4775 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4776 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4777 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4778 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4779 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4780 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4781 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4782 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
// NOTE(review): the three rows below give no texunits initializer, so C zero-fills that
// member instead of terminating it with ~0 (compare the Depth_Or_Shadow row, which
// spells out {~0}, {~0}). A zero entry looks like a reference to texture unit 0 to the
// per-triangle mip loop - confirm whether an explicit {~0} was intended.
4783 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4784 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4785 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Runs the depth test for one span and records the per-pixel result in the thread's
// pixel mask array, then stores the mask pointer and the (possibly trimmed) start
// of the span back into `span`.
//   thread - supplies depthtest, fb_depthfunc, the triangle list and pixelmaskarray
//   span   - span to test; read for x/y/startx/depthbase/depthslope, updated on exit
// NOTE(review): several short lines of this function (local declarations, the endx
// load/store and the bodies of the trimming loops) are not visible in this listing.
4788 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4793 unsigned int *depthpixel;
4797 unsigned char *pixelmask;
4798 DPSOFTRAST_State_Triangle *triangle;
4799 triangle = &thread->triangles[span->triangle];
// depth row for this span: row span->y, offset by the span's base x
4800 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4801 startx = span->startx;
4803 depth = span->depthbase;
4804 depthslope = span->depthslope;
4805 pixelmask = thread->pixelmaskarray;
// only test when depth testing is enabled and a depth buffer is bound
4806 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4808 switch(thread->fb_depthfunc)
// each case walks the span, stepping the interpolated depth d by depthslope per pixel,
// and writes the comparison of the stored depth against d into pixelmask[x]
4811 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4812 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4813 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4814 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4815 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4816 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4817 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// scan inward from both ends while the mask is clear (loop bodies not visible here;
// presumably they shrink startx/endx - confirm)
4819 while (startx < endx && !pixelmask[startx])
4821 while (endx > startx && !pixelmask[endx-1])
4826 // no depth testing means we're just dealing with color...
4827 memset(pixelmask + startx, 1, endx - startx);
4829 span->pixelmask = pixelmask;
4830 span->startx = startx;
// Depth write pass for one span. Does nothing unless depth writes (depthmask) and
// depth testing are both enabled and a depth buffer is bound.
//   thread - supplies the depthmask and depthtest flags
//   span   - supplies x/y/startx, depthbase/depthslope and the pixel mask
// NOTE(review): the endx load and the body of the per-pixel loop are not visible in
// this listing; presumably the loop stores d into depthpixel[x] where the mask is set.
4834 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4836 int x, d, depth, depthslope, startx, endx;
4837 const unsigned char *pixelmask;
4838 unsigned int *depthpixel;
4839 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4841 depth = span->depthbase;
4842 depthslope = span->depthslope;
4843 pixelmask = span->pixelmask;
4844 startx = span->startx;
// depth row for this span: row span->y, offset by the span's base x
4846 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// d is the interpolated depth, advanced by depthslope for every pixel
4847 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Flushes the thread's queued spans: for each span, run the depth test, run the
// current shader mode's span callback when colour output is active, then run the
// depth write pass. Resets thread->numspans to 0 when done.
//   thread - owns the spans[] / triangles[] queues being drained
4853 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4856 DPSOFTRAST_State_Triangle *triangle;
4857 DPSOFTRAST_State_Span *span;
4858 for (i = 0; i < thread->numspans; i++)
4860 span = &thread->spans[i];
4861 triangle = &thread->triangles[span->triangle];
4862 DPSOFTRAST_Draw_DepthTest(thread, span);
// span is empty after depth-test trimming (the statement this guards is not visible
// in this listing; presumably it skips to the next span)
4863 if (span->startx >= span->endx)
4865 // run pixel shader if appropriate
4866 // do this before running depthmask code, to allow the pixelshader
4867 // to clear pixelmask values for alpha testing
4868 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4869 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4870 DPSOFTRAST_Draw_DepthWrite(thread, span);
4872 thread->numspans = 0;
// Draw command payload: y range touched by the draw, a reference count shared by the
// worker threads, clip flag, vertex/triangle counts, the vertex data block and the
// 32-bit or 16-bit index list.
4875 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
// Executes one Draw command on behalf of `thread`: clips each triangle against the
// near plane, sets up interpolation planes for w and the shader's vertex arrays,
// picks mip levels, converts the triangle into horizontal spans restricted to the
// thread's scanline bands and queues them, flushing through
// DPSOFTRAST_Draw_ProcessSpans whenever the span or triangle queue fills up.
//   thread  - per-thread rasterizer state (scissor, bands, queues, bound textures)
//   command - draw command; its refcount is decremented once this thread is done
// NOTE(review): many short lines (braces, some declarations, break/continue/return
// statements) are not visible in this listing; comments below only describe what
// the visible lines do.
4877 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4880 int cullface = thread->cullface;
4881 int minx, maxx, miny, maxy;
4882 int miny1, maxy1, miny2, maxy2;
4883 __m128i fbmin, fbmax;
4884 __m128 viewportcenter, viewportscale;
4885 int firstvertex = command->firstvertex;
4886 int numvertices = command->numvertices;
4887 int numtriangles = command->numtriangles;
4888 const int *element3i = command->element3i;
4889 const unsigned short *element3s = command->element3s;
4890 int clipped = command->clipped;
4897 int starty, endy, bandy;
4901 float clip0origin, clip0slope;
4903 __m128 triangleedge1, triangleedge2, trianglenormal;
4906 DPSOFTRAST_State_Triangle *triangle;
4907 DPSOFTRAST_Texture *texture;
4908 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, then the thread's two scanline bands clamped into it
4909 miny = thread->fb_scissor[1];
4910 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4911 miny1 = bound(miny, thread->miny1, maxy);
4912 maxy1 = bound(miny, thread->maxy1, maxy);
4913 miny2 = bound(miny, thread->miny2, maxy);
4914 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's y range misses both bands: release this thread's reference, and
// free the vertex data when it was allocated outside the command (command no larger
// than the bare Draw header)
4915 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4917 if (!ATOMIC_DECREMENT(command->refcount))
4919 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4920 MM_FREE(command->arrays);
// horizontal scissor range and packed 16-bit (x, y) bounds used for bounding-box culling
4924 minx = thread->fb_scissor[0];
4925 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4926 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4927 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4928 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4929 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4930 screen[3] = _mm_setzero_ps();
4931 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4932 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen-space coordinates; the next numvertices*4
// floats are the position array used for near-plane clipping below
4934 const float *screencoord4f = command->arrays;
4935 const float *arrays = screencoord4f + numvertices*4;
4937 // generate the 3 edges of this triangle
4938 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices relative to firstvertex, from the 16-bit or the 32-bit index list
// (the condition choosing between them is not visible in this listing)
4941 e[0] = element3s[i*3+0] - firstvertex;
4942 e[1] = element3s[i*3+1] - firstvertex;
4943 e[2] = element3s[i*3+2] - firstvertex;
4947 e[0] = element3i[i*3+0] - firstvertex;
4948 e[1] = element3i[i*3+1] - firstvertex;
4949 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below:
//   SKIPBACKFACE      - builds the two edges from screen[1] and a scalar cross product
//                       in lane 0 of trianglenormal, then tests its sign
//   CLIPPEDVERTEXLERP - screen[k] = projection of the position interpolated between
//                       vertices p1 and p2 at the near-plane crossing (clipfrac[p1])
//   CLIPPEDVERTEXCOPY - screen[k] = already-projected coordinate of vertex p1
//   GENATTRIBCOPY / GENATTRIBLERP / GENATTRIBS - the same copy/interpolate choice
//                       applied to the current attribute array
4958 #define SKIPBACKFACE \
4959 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4960 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4961 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4962 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4963 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4967 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4971 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4976 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4977 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4979 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4980 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4982 #define CLIPPEDVERTEXCOPY(k,p1) \
4983 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4985 #define GENATTRIBCOPY(attrib, p1) \
4986 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4987 #define GENATTRIBLERP(attrib, p1, p2) \
4989 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4990 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4992 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4996 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4997 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4998 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4999 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
5000 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
5001 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
5002 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
5008 // calculate distance from nearplane
5009 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5010 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5011 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// decision tree over which of the three vertices have clipdist >= 0; each leaf builds
// the 3 or 4 screen-space points of the clipped polygon
5012 if (clipdist[0] >= 0.0f)
5014 if (clipdist[1] >= 0.0f)
5016 if (clipdist[2] >= 0.0f)
5019 // triangle is entirely in front of nearplane
5020 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5027 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5035 if (clipdist[2] >= 0.0f)
5037 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5044 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5051 else if (clipdist[1] >= 0.0f)
5053 if (clipdist[2] >= 0.0f)
5055 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5062 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5068 else if (clipdist[2] >= 0.0f)
5070 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5075 else continue; // triangle is entirely behind nearplane
5078 // calculate integer y coords for triangle points
// pack the truncated (x, y) of all points into 16-bit lanes (point 2 is repeated when
// there is no fourth point), reduce to min/max and clamp to the framebuffer bounds
5079 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5080 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5081 screenmin = _mm_min_epi16(screeni, screenir),
5082 screenmax = _mm_max_epi16(screeni, screenir);
5083 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5084 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5085 screenmin = _mm_max_epi16(screenmin, fbmin);
5086 screenmax = _mm_min_epi16(screenmax, fbmax);
5087 // skip offscreen triangles
5088 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5090 starty = _mm_extract_epi16(screenmin, 1);
5091 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle's rows lie entirely in the gap between the thread's two bands
5092 if (starty >= maxy1 && endy <= miny2)
5094 screeny = _mm_srai_epi32(screeni, 16);
5097 triangle = &thread->triangles[thread->numtriangles];
5099 // calculate attribute plans for triangle data...
5100 // okay, this triangle is going to produce spans, we'd better project
5101 // the interpolants now (this is what gives perspective texturing),
5102 // this consists of simply multiplying all arrays by the W coord
5103 // (which is basically 1/Z), which will be undone per-pixel
5104 // (multiplying by Z again) to get the perspective-correct array
5107 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5108 __m128 mipedgescale, mipdensity;
// edge components divided by the cross product give the screen-space gradient basis;
// it is applied first to w (stored in triangle->w as x slope, y slope, origin)
5109 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5110 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5111 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5112 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5113 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5114 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5115 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5116 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5117 attribedge1 = _mm_sub_ss(w0, w1);
5118 attribedge2 = _mm_sub_ss(w2, w1);
5119 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5120 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5121 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5122 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5123 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5124 _mm_store_ss(&triangle->w[0], attribxslope);
5125 _mm_store_ss(&triangle->w[1], attribyslope);
5126 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: build the same kind of plane for screen z, combine it with
// fb_clipplane and reduce it to a per-scanline x boundary (clip0origin/clip0slope)
5131 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5133 float cliporigin, clipxslope, clipyslope;
5134 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5135 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5136 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5137 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5138 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5139 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5140 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5141 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5142 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5145 clip0origin = -cliporigin/clipxslope;
5146 clip0slope = -clipyslope/clipxslope;
5147 clip0dir = clipxslope > 0 ? 1 : -1;
5149 else if(clipyslope > 0)
5151 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5152 clip0slope = dpsoftrast.fb_width;
5155 else if(clipyslope < 0)
5157 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5158 clip0slope = -dpsoftrast.fb_width;
// NOTE(review): this branch tests clip0origin, which none of the visible assignments
// above has set on this path; cliporigin may have been intended - confirm.
5161 else if(clip0origin < 0) continue;
5164 mipedgescale = _mm_setzero_ps();
// per-attribute setup: for each array listed for the shader mode, pre-multiply the
// three vertex values by w and store x slope, y slope and origin in triangle->attribs
5165 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5167 __m128 attrib0, attrib1, attrib2;
5168 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5169 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5171 arrays += numvertices*4;
5172 GENATTRIBS(attrib0, attrib1, attrib2);
5173 attriborigin = _mm_mul_ps(attrib1, w1);
5174 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5175 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5176 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5177 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5178 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5179 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5180 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5181 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array used for level-of-detail: attribute change along each edge divided by
// the edge's screen length (via reciprocal square root)
5182 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5184 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5185 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5186 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5187 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip level per texture unit used by the shader mode; defaults to 0
5191 memset(triangle->mip, 0, sizeof(triangle->mip));
5192 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5194 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5195 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5197 texture = thread->texbound[texunit];
// only textures whose filter value is above LINEAR get a computed mip level
5198 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5200 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5201 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5202 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5203 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5204 // this will be multiplied in the texturing routine by the texture resolution
5205 y = _mm_cvtss_si32(mipdensity);
// log(y) * 0.5 / ln(2) is half the base-2 logarithm of the squared density
5208 y = (int)(log((float)y)*0.5f/M_LN2);
5209 if (y > texture->mipmaps - 1)
5210 y = texture->mipmaps - 1;
5211 triangle->mip[texunit] = y;
// walk the triangle's rows in up to two passes, one per band (bandy is the exclusive
// end row of the current band)
5217 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5220 __m128 xcoords, xslope;
// compare y against every point's integer y; the byte mask selects which pair of
// edges bounds the current row
5221 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5222 int yccmask = _mm_movemask_epi8(ycc);
5223 int edge0p, edge0n, edge1p, edge1n;
// edge selection for a 4-point (clipped) polygon
5232 case 0xFFFF: /*0000*/ y = endy; continue;
5233 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5234 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5235 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5236 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5237 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5238 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5239 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5240 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5241 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5242 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5243 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5244 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5245 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5246 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5247 case 0x0000: /*1111*/ y++; continue;
// edge selection for a plain 3-point triangle
5255 case 0xFFFF: /*000*/ y = endy; continue;
5256 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5257 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5258 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5259 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5260 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5261 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5262 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can use the same edge pair, limited to the current band
5265 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5266 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5267 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5268 nexty = _mm_extract_epi16(ycc, 0);
5269 if (nexty >= bandy) nexty = bandy-1;
// x-per-row slope of both edges and their x at row y (plus 0.5); swap the halves so
// the smaller x comes first
5270 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5271 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5272 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5273 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5274 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5275 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5277 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5278 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// x position of the user clip boundary on this row, stepped per row below
5280 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5281 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5283 int startx, endx, offset;
// row extent from the two edges, clamped to the horizontal scissor range
5284 startx = _mm_cvtss_si32(xcoords);
5285 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5286 if (startx < minx) startx = minx;
5287 if (endx > maxx) endx = maxx;
5288 if (startx >= endx) continue;
// trim or drop the row against the clip boundary (the enclosing conditions and the
// endx adjustment are not visible in this listing)
5296 if(endx <= clip0) continue;
5297 startx = (int)clip0;
5300 else if (endx > clip0)
5302 if(startx >= clip0) continue;
// split the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5307 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5309 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5310 span->triangle = thread->numtriangles;
5314 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5315 if (span->startx >= span->endx)
// fixed-point depth for the span: w at the span origin scaled by DPSOFTRAST_DEPTHSCALE,
// minus the polygon offset term
5317 wslope = triangle->w[0];
5318 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5319 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5320 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: shade everything queued so far
5321 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5322 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush the spans that reference it, then reuse the slots
5327 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5329 DPSOFTRAST_Draw_ProcessSpans(thread);
5330 thread->numtriangles = 0;
// done with the command: drop this thread's reference; free separately allocated
// vertex data under the same size test as the early-out above
5334 if (!ATOMIC_DECREMENT(command->refcount))
5336 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5337 MM_FREE(command->arrays);
// flush whatever is still queued
5340 if (thread->numspans > 0 || thread->numtriangles > 0)
5342 DPSOFTRAST_Draw_ProcessSpans(thread);
5343 thread->numtriangles = 0;
// Allocates a Draw command together with storage for the screen coordinates, the
// transformed position array, one array per vertex array listed for the current
// shader mode, and a private copy of the index list. Also points
// dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] at that storage.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of index triples
//   element3i / element3s    - 32-bit or 16-bit indices; the visible code copies one of them
// Returns the new command (the return statement is not visible in this listing;
// the caller uses the result as the command pointer).
// NOTE(review): sizes are computed in `int`, and the MM_CALLOC result is used without
// a visible check - confirm both are acceptable for the expected draw sizes.
5348 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5352 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] arrays are always present: screen coordinates and positions
5353 int datasize = 2*numvertices*sizeof(float[4]);
5354 DPSOFTRAST_Command_Draw *command;
5355 unsigned char *data;
// one more float[4] array for each entry of the shader mode's array list
5356 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5358 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5359 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5361 datasize += numvertices*sizeof(float[4]);
// room for the index copy (the condition selecting 16-bit vs 32-bit is not visible)
5364 datasize += numtriangles*sizeof(unsigned short[3]);
5366 datasize += numtriangles*sizeof(int[3]);
5367 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for one command: allocate only the header in the command stream and put
// the data in a separate zeroed MM_CALLOC block; otherwise the data follows the header
5368 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5370 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5371 data = (unsigned char *)MM_CALLOC(datasize, 1);
5375 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5376 data = (unsigned char *)command + commandsize;
5378 command->firstvertex = firstvertex;
5379 command->numvertices = numvertices;
5380 command->numtriangles = numtriangles;
5381 command->arrays = (float *)data;
// carve the data block: screen coordinates, positions, then the shader's arrays in
// table order; unused post_array4f entries stay NULL from the memset
5382 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5383 dpsoftrast.firstvertex = firstvertex;
5384 dpsoftrast.numvertices = numvertices;
5385 dpsoftrast.screencoord4f = (float *)data;
5386 data += numvertices*sizeof(float[4]);
5387 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5388 data += numvertices*sizeof(float[4]);
5389 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5391 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5392 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5394 dpsoftrast.post_array4f[j] = (float *)data;
5395 data += numvertices*sizeof(float[4]);
// the index list is copied after the vertex arrays so the command owns its own copy
5397 command->element3i = NULL;
5398 command->element3s = NULL;
5401 command->element3s = (unsigned short *)data;
5402 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5406 command->element3i = (int *)data;
5407 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point for drawing indexed triangles: allocates the Draw command, runs
// the current shader mode's vertex stage, clamps the touched y range to the
// framebuffer height and then either wakes the worker threads or flushes directly.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of index triples
//   element3i / element3s    - 32-bit or 16-bit index list (forwarded to the allocator)
5412 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5414 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5415 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5416 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5417 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty y range: free separately allocated vertex data and take the command back out
// of the queue (presumably followed by an early return, which is not visible here)
5418 if (command->starty >= command->endy)
5420 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5421 MM_FREE(command->arrays);
5422 DPSOFTRAST_UndoCommand(command->commandsize);
5425 command->clipped = dpsoftrast.drawclipped;
// one reference per worker thread; each DPSOFTRAST_Interpret_Draw call drops one
5426 command->refcount = dpsoftrast.numthreads;
5428 if (dpsoftrast.usethreads)
5431 DPSOFTRAST_Draw_SyncCommands();
// signal only threads flagged as starving whose bands overlap the draw's y range
5432 for (i = 0; i < dpsoftrast.numthreads; i++)
5434 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5435 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5436 Thread_CondSignal(thread->drawcond);
// (presumably the non-threaded branch; the `else` line is not visible in this listing)
5441 DPSOFTRAST_Draw_FlushThreads();
// Declares the SetRenderTargets command record, whose payload is the new width and height.
// NOTE(review): the leading 23 is presumably the opcode number - confirm against DEFCOMMAND.
5445 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
// Per-thread handler for a SetRenderTargets command: sets the
// DPSOFTRAST_VALIDATE_FB bit in the thread's validate mask. The command
// payload itself is not read by the statement shown.
5446 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5448 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Switches the rasterizer to a new set of render targets.
//   width, height   - dimensions of the new targets
//   depthpixels     - depth buffer pointer
//   colorpixels0..3 - color buffer pointers stored into fb_colorpixels[0..3]
// The global framebuffer fields are overwritten, DPSOFTRAST_RecalcViewport is
// called, and a SetRenderTargets command carrying width/height is queued.
5450 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5452 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any dimension or buffer pointer differs from the current global state
// NOTE(review): the statement guarded by this test is not visible here - presumably pending work is drained before the targets change; confirm
5453 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5454 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5455 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5457 dpsoftrast.fb_width = width;
5458 dpsoftrast.fb_height = height;
5459 dpsoftrast.fb_depthpixels = depthpixels;
5460 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5461 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5462 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5463 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// refresh fb_viewportcenter / fb_viewportscale now that the framebuffer fields changed
5464 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// queue the change; the command carries only the new dimensions
5465 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5466 command->width = width;
5467 command->height = height;
// Executes queued commands on behalf of one thread.
//   thread    - thread state; reading starts at thread->commandoffset
//   endoffset - pool offset at which interpretation stops
// The read position is written back to thread->commandoffset after each Draw
// command and again once the loop ends.
5470 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5472 int commandoffset = thread->commandoffset;
5473 while (commandoffset != endoffset)
5475 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5476 switch (command->opcode)
// INTERPCOMMAND(name) emits the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned size of its command
// struct, and wrap to offset 0 on reaching DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
5478 #define INTERPCOMMAND(name) \
5479 case DPSOFTRAST_OPCODE_##name : \
5480 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5481 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5482 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5483 commandoffset = 0; \
5485 INTERPCOMMAND(Viewport)
5486 INTERPCOMMAND(ClearColor)
5487 INTERPCOMMAND(ClearDepth)
5488 INTERPCOMMAND(ColorMask)
5489 INTERPCOMMAND(DepthTest)
5490 INTERPCOMMAND(ScissorTest)
5491 INTERPCOMMAND(Scissor)
5492 INTERPCOMMAND(BlendFunc)
5493 INTERPCOMMAND(BlendSubtract)
5494 INTERPCOMMAND(DepthMask)
5495 INTERPCOMMAND(DepthFunc)
5496 INTERPCOMMAND(DepthRange)
5497 INTERPCOMMAND(PolygonOffset)
5498 INTERPCOMMAND(CullFace)
5499 INTERPCOMMAND(SetTexture)
5500 INTERPCOMMAND(SetShader)
5501 INTERPCOMMAND(Uniform4f)
5502 INTERPCOMMAND(UniformMatrix4f)
5503 INTERPCOMMAND(Uniform1i)
5504 INTERPCOMMAND(SetRenderTargets)
5505 INTERPCOMMAND(ClipPlane)
// Draw commands are handled by hand because their size varies: the step is
// the commandsize stored in the command rather than a sizeof
5507 case DPSOFTRAST_OPCODE_Draw:
5508 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5509 commandoffset += command->commandsize;
5510 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// record progress in the thread state right after the draw
5512 thread->commandoffset = commandoffset;
5515 case DPSOFTRAST_OPCODE_Reset:
// final read position once endoffset has been reached
5520 thread->commandoffset = commandoffset;
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
// Runs for as long as thread->index is non-negative, alternating between
// executing newly queued commands and sleeping on drawcond once caught up.
5523 static int DPSOFTRAST_Draw_Thread(void *data)
5525 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5526 while(thread->index >= 0)
// commands are pending between this thread's read offset and dpsoftrast.drawcommand
5528 if (thread->commandoffset != dpsoftrast.drawcommand)
5530 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// the caught-up test is repeated while holding drawmutex before going to sleep
5534 Thread_LockMutex(thread->drawmutex);
5535 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// wake a caller blocked on waitcond, then sleep on drawcond; starving is
// set for the duration of the wait so other code can tell the thread is idle
5537 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5538 thread->starving = true;
5539 Thread_CondWait(thread->drawcond, thread->drawmutex);
5540 thread->starving = false;
5542 Thread_UnlockMutex(thread->drawmutex);
// Brings every rasterizer thread up to date with the command queue, then
// marks the command pool as empty (usedcommands = 0).
// With threading enabled this wakes idle workers that still have work and
// blocks until each one has caught up with dpsoftrast.drawcommand.
5548 static void DPSOFTRAST_Draw_FlushThreads(void)
5550 DPSOFTRAST_State_Thread *thread;
5552 DPSOFTRAST_Draw_SyncCommands();
5553 if (dpsoftrast.usethreads)
// pass 1: under each thread's drawmutex, signal threads that are asleep
// (starving) although commands are still outstanding for them
5555 for (i = 0; i < dpsoftrast.numthreads; i++)
5557 thread = &dpsoftrast.threads[i];
5558 if (thread->commandoffset != dpsoftrast.drawcommand)
5560 Thread_LockMutex(thread->drawmutex);
5561 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5562 Thread_CondSignal(thread->drawcond);
5563 Thread_UnlockMutex(thread->drawmutex);
// pass 2: block on waitcond until each thread has caught up
5566 for (i = 0; i < dpsoftrast.numthreads; i++)
5568 thread = &dpsoftrast.threads[i];
5569 if (thread->commandoffset != dpsoftrast.drawcommand)
5571 Thread_LockMutex(thread->drawmutex);
// re-test in a loop rather than once: a condition wait may return without a
// matching signal, and leaving early would let the caller continue while
// the thread is still executing commands
5572 while (thread->commandoffset != dpsoftrast.drawcommand)
5574 thread->waiting = true;
5575 Thread_CondWait(thread->waitcond, thread->drawmutex);
5576 thread->waiting = false;
5578 Thread_UnlockMutex(thread->drawmutex);
// run outstanding commands directly on the calling thread
// NOTE(review): presumably the non-threaded path - the branch keyword is not visible here; confirm
5584 for (i = 0; i < dpsoftrast.numthreads; i++)
5586 thread = &dpsoftrast.threads[i];
5587 if (thread->commandoffset != dpsoftrast.drawcommand)
5588 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5591 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
5594 void DPSOFTRAST_Flush(void)
5596 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments and returning nothing.
// NOTE(review): the body is not visible in this view - behavior unconfirmed.
5599 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and its per-thread states.
//   width, height - framebuffer dimensions; also used as the initial viewport
//                   and scissor rectangle
//   numthreads    - requested worker count; threading is enabled only when
//                   this is > 0 and Thread_HasThreads() is true, in which case
//                   the count is bounded to 1..64; otherwise one thread state
//                   is created and no worker thread is started
//   interlace     - bounded to 0..1 when threading is enabled, else forced to 0
//   colorpixels   - color buffer installed as render target 0
//   depthpixels   - depth buffer
5604 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero state; everything not assigned below stays 0/NULL
5614 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5615 dpsoftrast.bigendian = u.b[3];
5616 dpsoftrast.fb_width = width;
5617 dpsoftrast.fb_height = height;
5618 dpsoftrast.fb_depthpixels = depthpixels;
5619 dpsoftrast.fb_colorpixels[0] = colorpixels;
// only render target 0 is supplied; targets 1..3 start out unset
// (each index is now cleared once instead of index 1 three times)
5620 dpsoftrast.fb_colorpixels[1] = NULL;
5621 dpsoftrast.fb_colorpixels[2] = NULL;
5622 dpsoftrast.fb_colorpixels[3] = NULL;
// initial viewport covers the whole framebuffer
5623 dpsoftrast.viewport[0] = 0;
5624 dpsoftrast.viewport[1] = 0;
5625 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5626 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5627 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with no storage allocated yet
5628 dpsoftrast.texture_firstfree = 1;
5629 dpsoftrast.texture_end = 1;
5630 dpsoftrast.texture_max = 0;
5631 dpsoftrast.color[0] = 1;
5632 dpsoftrast.color[1] = 1;
5633 dpsoftrast.color[2] = 1;
5634 dpsoftrast.color[3] = 1;
// threading needs both a positive request and platform support
5635 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5636 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5637 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): the allocation result is used without a NULL check
5638 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// give every thread state the same default render state
5639 for (i = 0; i < dpsoftrast.numthreads; i++)
5641 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5643 thread->cullface = GL_BACK;
5644 thread->colormask[0] = 1;
5645 thread->colormask[1] = 1;
5646 thread->colormask[2] = 1;
5647 thread->colormask[3] = 1;
5648 thread->blendfunc[0] = GL_ONE;
5649 thread->blendfunc[1] = GL_ZERO;
5650 thread->depthmask = true;
5651 thread->depthtest = true;
5652 thread->depthfunc = GL_LEQUAL;
5653 thread->scissortest = false;
5654 thread->viewport[0] = 0;
5655 thread->viewport[1] = 0;
5656 thread->viewport[2] = dpsoftrast.fb_width;
5657 thread->viewport[3] = dpsoftrast.fb_height;
5658 thread->scissor[0] = 0;
5659 thread->scissor[1] = 0;
5660 thread->scissor[2] = dpsoftrast.fb_width;
5661 thread->scissor[3] = dpsoftrast.fb_height;
5662 thread->depthrange[0] = 0;
5663 thread->depthrange[1] = 1;
5664 thread->polygonoffset[0] = 0;
5665 thread->polygonoffset[1] = 0;
5666 thread->clipplane[0] = 0;
5667 thread->clipplane[1] = 0;
5668 thread->clipplane[2] = 0;
5669 thread->clipplane[3] = 1;
5671 thread->numspans = 0;
5672 thread->numtriangles = 0;
5673 thread->commandoffset = 0;
5674 thread->waiting = false;
5675 thread->starving = false;
// mark every validation bit dirty and validate once up front
5677 thread->validate = -1;
5678 DPSOFTRAST_Validate(thread, -1);
// threaded mode: create the two condition variables and the mutex the worker
// relies on before starting it
5680 if (dpsoftrast.usethreads)
5682 thread->waitcond = Thread_CreateCond();
5683 thread->drawcond = Thread_CreateCond();
5684 thread->drawmutex = Thread_CreateMutex();
5685 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5691 void DPSOFTRAST_Shutdown(void)
5694 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5696 DPSOFTRAST_State_Thread *thread;
5697 for (i = 0; i < dpsoftrast.numthreads; i++)
5699 thread = &dpsoftrast.threads[i];
5700 Thread_LockMutex(thread->drawmutex);
5702 Thread_CondSignal(thread->drawcond);
5703 Thread_UnlockMutex(thread->drawmutex);
5704 Thread_WaitThread(thread->thread, 0);
5705 Thread_DestroyCond(thread->waitcond);
5706 Thread_DestroyCond(thread->drawcond);
5707 Thread_DestroyMutex(thread->drawmutex);
5710 for (i = 0;i < dpsoftrast.texture_end;i++)
5711 if (dpsoftrast.texture[i].bytes)
5712 MM_FREE(dpsoftrast.texture[i].bytes);
5713 if (dpsoftrast.texture)
5714 free(dpsoftrast.texture);
5715 if (dpsoftrast.threads)
5716 MM_FREE(dpsoftrast.threads);
5717 memset(&dpsoftrast, 0, sizeof(dpsoftrast));