3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Alignment in bytes passed to _mm_malloc by MM_MALLOC / MM_CALLOC.
18 #define ATOMIC_SIZE 32
// Per-platform definitions of the alignment and atomic helpers used in this file:
//   ALIGN(var) / ATOMIC(var)   declare var with 16- / 32-byte alignment
//   MEMORY_BARRIER             memory fence (used by DPSOFTRAST_Draw_SyncCommands)
//   ATOMIC_COUNTER             integer type the atomic operations work on
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD
// NOTE(review): every platform branch maps MEMORY_BARRIER to _mm_sfence(), a
// store fence, and the __sync_synchronize() alternative is commented out;
// confirm that store-only ordering is enough for the users of this macro.
// Apple: OSAtomic*Barrier primitives from <libkern/OSAtomic.h>.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows: GCC alignment attributes with the Interlocked* functions.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
37 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
38 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Any other GCC: the __sync_* builtins.
39 #elif defined(__GNUC__)
40 #define ALIGN(var) var __attribute__((__aligned__(16)))
41 #define ATOMIC(var) var __attribute__((__aligned__(32)))
42 #define MEMORY_BARRIER (_mm_sfence())
43 //(__sync_synchronize())
44 #define ATOMIC_COUNTER volatile int
45 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
46 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
47 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) with the Interlocked* functions.
48 #elif defined(_MSC_VER)
49 #define ALIGN(var) __declspec(align(16)) var
50 #define ATOMIC(var) __declspec(align(32)) var
51 #define MEMORY_BARRIER (_mm_sfence())
53 #define ATOMIC_COUNTER volatile LONG
54 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
55 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
56 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment request, no fence, and plain
// (non-atomic) integer arithmetic on the counter.
61 #define ALIGN(var) var
64 #define ATOMIC(var) var
66 #ifndef MEMORY_BARRIER
67 #define MEMORY_BARRIER ((void)0)
69 #ifndef ATOMIC_COUNTER
70 #define ATOMIC_COUNTER int
72 #ifndef ATOMIC_INCREMENT
73 #define ATOMIC_INCREMENT(counter) (++(counter))
75 #ifndef ATOMIC_DECREMENT
76 #define ATOMIC_DECREMENT(counter) (--(counter))
79 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
83 #include <emmintrin.h>
// Supplies _mm_cvtss_f32 (extract element 0 of a vector as float) through a
// GCC builtin when compiling with GCC but not clang.
// NOTE(review): `__GNUC` is missing its trailing underscores. An undefined
// identifier evaluates to 0 inside #if, so `__GNUC < 4` is always true and the
// version test never filters anything. The intended condition is probably
// `__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)` - confirm.
85 #if defined(__GNUC__) && (__GNUC < 4 || __GNUC_MINOR__ < 6) && !defined(__clang__)
86 #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
// Allocation helpers. This first set is built on _mm_malloc / _mm_free and
// requests ATOMIC_SIZE-byte alignment.
89 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// Aligned counterpart of calloc: allocates nmemb*size bytes and zero-fills them.
// NOTE(review): nmemb*size is not checked for overflow.
91 static void *MM_CALLOC(size_t nmemb, size_t size)
93 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
// only clear the buffer when the allocation succeeded
94 if (ptr != NULL) memset(ptr, 0, nmemb*size);
98 #define MM_FREE _mm_free
// Alternative definitions that map directly onto the C library allocator.
100 #define MM_MALLOC(size) malloc(size)
101 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and
// is used to size attribs[] and post_array4f[] below.
105 typedef enum DPSOFTRAST_ARRAY_e
107 DPSOFTRAST_ARRAY_POSITION,
108 DPSOFTRAST_ARRAY_COLOR,
109 DPSOFTRAST_ARRAY_TEXCOORD0,
110 DPSOFTRAST_ARRAY_TEXCOORD1,
111 DPSOFTRAST_ARRAY_TEXCOORD2,
112 DPSOFTRAST_ARRAY_TEXCOORD3,
113 DPSOFTRAST_ARRAY_TEXCOORD4,
114 DPSOFTRAST_ARRAY_TEXCOORD5,
115 DPSOFTRAST_ARRAY_TEXCOORD6,
116 DPSOFTRAST_ARRAY_TEXCOORD7,
117 DPSOFTRAST_ARRAY_TOTAL
// One texture slot. A slot counts as in use while `bytes` is non-NULL
// (see DPSOFTRAST_Texture_GetByIndex and DPSOFTRAST_Texture_New).
121 typedef struct DPSOFTRAST_Texture_s
128 DPSOFTRAST_TEXTURE_FILTER filter;
131 ATOMIC_COUNTER binds;
// pixel storage for all mip levels, allocated by DPSOFTRAST_Texture_New
132 unsigned char *bytes;
// per mip level, as filled in by DPSOFTRAST_Texture_New:
// [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
133 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are padded to multiples of COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND)
// and declared with the ALIGN() attribute.
// NOTE(review): ALIGN_SIZE is not defined in the lines visible here - confirm
// it is defined earlier in the file.
137 #define COMMAND_SIZE ALIGN_SIZE
138 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command in the command pool.
140 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
// DPSOFTRAST_OPCODE_* value identifying the command
142 unsigned char opcode;
// size in bytes as passed to DPSOFTRAST_AllocateCommand
143 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when the remaining tail of
// the command pool is too small for the command being allocated.
147 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the enum constant
// DPSOFTRAST_OPCODE_<name> and the typedef DPSOFTRAST_Command_<name>, a struct
// that starts with the same opcode/commandsize header as DPSOFTRAST_Command.
// Presumably `fields` supplies the command-specific members - TODO confirm.
149 #define DEFCOMMAND(opcodeval, name, fields) \
150 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
151 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
153 unsigned char opcode; \
154 unsigned short commandsize; \
156 } DPSOFTRAST_Command_##name );
// Size in bytes of the command byte buffer (2097152 = 2 MiB).
158 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
159 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands; DPSOFTRAST_AllocateCommand hands out space from
// `commands` and wraps back to the start when the tail is too small.
161 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
165 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
167 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the span shaders.
169 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
171 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][k] is a 4-float vector; as used by DPSOFTRAST_CALCATTRIB,
// k=0 is the per-x slope, k=1 the per-y slope and k=2 the base value
173 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
175 DPSOFTRAST_State_Triangle);
// SSE attribute setup for a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// Uses aligned loads (_mm_load_ps), so attribs must keep its ALIGN() declaration.
177 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
178 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
179 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
180 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
181 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar version of DPSOFTRAST_CALCATTRIB: slope and data are 4-element float
// arrays filled component by component with the same formula.
// NOTE(review): `span` is expanded as (span->x) rather than ((span)->x), so an
// argument that is not a plain identifier may bind incorrectly.
183 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
184 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
185 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
186 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
187 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
188 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
189 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
190 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
191 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
194 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced from a triangle; x and y are also the
// coordinates fed into DPSOFTRAST_CALCATTRIB.
196 typedef ALIGN(struct DPSOFTRAST_State_Span_s
198 int triangle; // triangle this span was generated by
199 int x; // framebuffer x coord
200 int y; // framebuffer y coord
201 int startx; // usable range (according to pixelmask)
202 int endx; // usable range (according to pixelmask)
203 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
204 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
205 int depthslope; // depthbuffer value pixel delta
207 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[], triangles[] and pixelmaskarray[] buffers.
209 #define DPSOFTRAST_DRAW_MAXSPANS 1024
210 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
211 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Bits of thread->validate: a set bit means the matching derived values are
// stale and DPSOFTRAST_Validate must recompute them.
213 #define DPSOFTRAST_VALIDATE_FB 1
214 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
215 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
216 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes stored in thread->fb_blendmode; DPSOFTRAST_RecalcBlendFunc picks
// one from the source/destination blend factors and the blendsubtract flag.
218 typedef enum DPSOFTRAST_BLENDMODE_e
220 DPSOFTRAST_BLENDMODE_OPAQUE,
221 DPSOFTRAST_BLENDMODE_ALPHA,
222 DPSOFTRAST_BLENDMODE_ADDALPHA,
223 DPSOFTRAST_BLENDMODE_ADD,
224 DPSOFTRAST_BLENDMODE_INVMOD,
225 DPSOFTRAST_BLENDMODE_MUL,
226 DPSOFTRAST_BLENDMODE_MUL2,
227 DPSOFTRAST_BLENDMODE_SUBALPHA,
228 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
229 DPSOFTRAST_BLENDMODE_INVADD,
230 DPSOFTRAST_BLENDMODE_TOTAL
232 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state. The DPSOFTRAST_Interpret_* functions write the
// render state here, and the DPSOFTRAST_Recalc* functions derive the fb_*
// values from it when the matching validate bit is set.
234 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
253 float polygonoffset[2];
// clip plane rescaled by DPSOFTRAST_RecalcClipPlane
255 ALIGN(float fb_clipplane[4]);
258 int shader_permutation;
259 int shader_exactspecularmath;
261 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
263 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
264 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
266 // DPSOFTRAST_VALIDATE_ flags
269 // derived values (DPSOFTRAST_VALIDATE_FB)
272 ALIGN(float fb_viewportcenter[4]);
273 ALIGN(float fb_viewportscale[4]);
275 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
278 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool
// compares it against dpsoftrast.drawcommand and commandpool.freecommand
287 ATOMIC(volatile int commandoffset);
// waiting: set by DPSOFTRAST_Draw_FreeCommandPool while it blocks on waitcond
289 volatile bool waiting;
// starving: when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond
290 volatile bool starving;
297 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
298 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
299 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
301 DPSOFTRAST_State_Thread);
// Global rasterizer state: framebuffer pointers, the state written directly by
// the public API functions, the texture table, the worker threads and the
// command pool.
303 typedef ATOMIC(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers, indexed as y * fb_width + x
307 unsigned int *fb_depthpixels;
308 unsigned int *fb_colorpixels[4];
// recomputed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
311 ALIGN(float fb_viewportcenter[4]);
312 ALIGN(float fb_viewportscale[4]);
315 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
316 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
318 const float *pointer_vertex3f;
319 const float *pointer_color4f;
320 const unsigned char *pointer_color4ub;
321 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
324 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
325 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into `texture`; rebased by DPSOFTRAST_Texture_Grow after a realloc
326 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
330 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
331 float *screencoord4f;
337 int shader_permutation;
338 int shader_exactspecularmath;
// index at which DPSOFTRAST_Texture_New starts searching for an empty slot
342 int texture_firstfree;
// texture table, grown with realloc by DPSOFTRAST_Texture_Grow
343 DPSOFTRAST_Texture *texture;
// static message describing the most recent API error
348 const char *errorstring;
353 DPSOFTRAST_State_Thread *threads;
// copy of commandpool.freecommand taken by DPSOFTRAST_Draw_SyncCommands
355 ATOMIC(volatile int drawcommand);
357 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance used by every function in this file
361 DPSOFTRAST_State dpsoftrast;
// Depth values are scaled by 1024 * 1048576 (2^30) when converted to integers.
363 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
364 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit pixel: a in bits 24-31, r in 16-23,
// g in 8-15, b in 0-7; each channel is scaled by 255 and rounded with +0.5.
// NOTE(review): r, g, b and a are not parenthesized in the expansion, so
// expression arguments such as `x + y` would scale only their last operand.
365 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer value DEPTHSCALE * (1 - d).
// NOTE(review): d is likewise not parenthesized.
366 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
// Forward declarations of the span depth test / depth write routines.
368 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
369 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Derives the viewport center and scale vectors from viewport = {x, y, width, height}.
// Index 1 holds the x term, index 2 the y term (flipped against
// dpsoftrast.fb_height, with a negative scale), index 3 a constant 0.5 and
// index 0 the constants 0 (center) and 1 (scale).
371 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
373 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
374 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
375 fb_viewportcenter[3] = 0.5f;
376 fb_viewportcenter[0] = 0.0f;
377 fb_viewportscale[1] = 0.5f * viewport[2];
378 fb_viewportscale[2] = -0.5f * viewport[3];
379 fb_viewportscale[3] = 0.5f;
380 fb_viewportscale[0] = 1.0f;
// Computes the framebuffer row bands [miny1, maxy1) and [miny2, maxy2) owned
// by this thread, from its index, the thread count and the framebuffer height.
383 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced layout: the height is cut into 2*numthreads slices; the thread
// gets slice `index` and slice `numthreads + index`
385 if (dpsoftrast.interlace)
387 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
388 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
389 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// non-interlaced layout: one slice out of numthreads, stored in both pairs
394 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
395 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Rescales thread->clipplane into thread->fb_clipplane using the viewport
// scale and center. Components 0..2 are divided by viewportscale[1..3] and
// component 3 by viewportscale[0]; the center's dot product with the scaled
// plane is then subtracted from component 3.
399 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
401 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
402 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
403 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
404 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
// DPSOFTRAST_RecalcViewport sets fb_viewportcenter[0] to 0, so the last term contributes nothing
405 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes every value derived from framebuffer/viewport state: the scissor
// rectangle in framebuffer coordinates (fb_scissor = {x, y, width, height}),
// the viewport transform, the clip plane and the thread's row bands.
408 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
410 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
411 // and viewport projection values
414 x1 = thread->scissor[0];
415 x2 = thread->scissor[0] + thread->scissor[2];
// the scissor's y is flipped against the framebuffer height
416 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
417 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the rectangle covers the whole framebuffer
418 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
420 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
422 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
423 thread->fb_scissor[0] = x1;
424 thread->fb_scissor[1] = y1;
425 thread->fb_scissor[2] = x2 - x1;
426 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport values, so it is recomputed after them
428 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
429 DPSOFTRAST_RecalcClipPlane(thread);
430 DPSOFTRAST_RecalcThread(thread);
// Derives the effective depth function: the configured depthfunc while depth
// testing is enabled, GL_ALWAYS otherwise.
433 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
435 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the (source, destination) blend factor pair to a DPSOFTRAST_BLENDMODE.
// The pair is packed into one integer as (sfactor << 16) | dfactor so a single
// switch can match it. Pairs that are not listed fall back to
// DPSOFTRAST_BLENDMODE_OPAQUE.
438 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only GL_SRC_ALPHA / GL_ONE is recognised
440 if (thread->blendsubtract)
442 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// NOTE(review): BLENDFUNC is defined inside the function and no #undef is
// visible here, so the macro may stay defined for the rest of the file.
444 #define BLENDFUNC(sfactor, dfactor, blendmode) \
445 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
446 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
447 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending
452 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
454 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
455 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
456 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
457 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
458 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs select DPSOFTRAST_BLENDMODE_MUL
459 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
460 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
461 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
462 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
463 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
464 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the requested bits is
// set in thread->validate; the expression always evaluates to 0.
// NOTE(review): `thread` is not parenthesized in the expansion.
469 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state groups that are both requested in `mask` and
// marked stale in thread->validate, clearing each stale bit before its
// recalculation runs.
471 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested bits that are actually stale
473 mask &= thread->validate;
476 if (mask & DPSOFTRAST_VALIDATE_FB)
478 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
479 DPSOFTRAST_RecalcFB(thread);
481 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
483 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
484 DPSOFTRAST_RecalcDepthFunc(thread);
486 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
488 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
489 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. The slot is returned only when the index
// is at least 1, below dpsoftrast.texture_end, and the slot has pixel storage.
493 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
495 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
496 return &dpsoftrast.texture[index];
// Enlarges the texture table and rebases every bound-texture pointer (global
// and per-thread) onto the reallocated array.
500 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be converted to offsets
502 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
503 DPSOFTRAST_State_Thread *thread;
507 // expand texture array as needed
508 if (dpsoftrast.texture_max < 1024)
509 dpsoftrast.texture_max = 1024;
511 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored straight into dpsoftrast.texture
// and no NULL check is visible; a failed realloc would lose the old table and
// the rebasing below would compute pointers from NULL.
512 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings: same element offset, new base address
513 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
514 if (dpsoftrast.texbound[i])
515 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase each thread's bindings the same way
516 for (j = 0; j < dpsoftrast.numthreads; j++)
518 thread = &dpsoftrast.threads[j];
519 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
520 if (thread->texbound[i])
521 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Validates the requested flags and dimensions, claims a slot in the texture
// table, records the layout of each mip level and allocates zeroed pixel
// storage. Each rejected request stores a message in dpsoftrast.errorstring.
// Presumably returns the slot index on success - TODO confirm.
525 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// a cubemap stores six faces, anything else one
534 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
535 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
536 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is computed in int and can overflow before the
// per-dimension limit below is checked; two negative dimensions also yield a
// positive product.
537 if (width*height*depth < 1)
539 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
544 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format checks
549 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
550 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
551 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
554 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
556 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the same message is assigned a second time here - confirm
// this path is really about the DEPTH format.
561 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
566 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations rejected for 3D (depth != 1) and cubemap textures
571 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
573 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
578 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the previous condition and message exactly.
581 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
583 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
588 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when more than one bit is set in x
591 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
593 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596 // find first empty slot in texture array
597 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
598 if (!dpsoftrast.texture[texnum].bytes)
// the next search can start just past the slot being claimed
600 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the chosen index lies beyond its capacity
601 if (dpsoftrast.texture_max <= texnum)
602 DPSOFTRAST_Texture_Grow();
603 if (dpsoftrast.texture_end <= texnum)
604 dpsoftrast.texture_end = texnum + 1;
605 texture = &dpsoftrast.texture[texnum];
606 memset(texture, 0, sizeof(*texture));
607 texture->flags = flags;
608 texture->width = width;
609 texture->height = height;
610 texture->depth = depth;
611 texture->sides = sides;
// per mip level: 4 bytes per pixel across all sides; `size` is the running
// byte offset of the level inside the pixel storage
623 s = w * h * d * sides * 4;
624 texture->mipmap[mipmaps][0] = size;
625 texture->mipmap[mipmaps][1] = s;
626 texture->mipmap[mipmaps][2] = w;
627 texture->mipmap[mipmaps][3] = h;
628 texture->mipmap[mipmaps][4] = d;
// condition met once the level is a single pixel, or when mipmaps were not requested
631 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
637 texture->mipmaps = mipmaps;
638 texture->size = size;
640 // allocate the pixels now
641 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// free-slot hint and the used-range end. Does nothing for an invalid index.
645 void DPSOFTRAST_Texture_Free(int index)
647 DPSOFTRAST_Texture *texture;
648 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
652 MM_FREE(texture->bytes);
// the memset below zeroes the whole slot, including bytes
653 texture->bytes = NULL;
654 memset(texture, 0, sizeof(*texture));
655 // adjust the free range and used range
656 if (dpsoftrast.texture_firstfree > index)
657 dpsoftrast.texture_firstfree = index;
// trim empty slots off the end of the used range
658 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
659 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture by averaging pixels of the
// previous level, 4 bytes per pixel with rounding. Does nothing for an invalid
// index.
661 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
663 int i, x, y, z, w, layer0, layer1, row0, row1;
// o: output row; i0..i3: source rows taken from (layer0|layer1) x (row0|row1)
664 unsigned char *o, *i0, *i1, *i2, *i3;
665 DPSOFTRAST_Texture *texture;
666 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
667 if (texture->mipmaps <= 1)
669 for (i = 1;i < texture->mipmaps;i++)
671 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's last layer
675 if (layer1 >= texture->mipmap[i-1][4])
676 layer1 = texture->mipmap[i-1][4]-1;
677 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's last row
681 if (row1 >= texture->mipmap[i-1][3])
682 row1 = texture->mipmap[i-1][3]-1;
// row start = level offset + 4 * ((height * layer + row) * width)
683 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
684 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
685 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
686 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
687 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
688 w = texture->mipmap[i][2];
691 if (texture->mipmap[i-1][2] > 1)
693 // average 3D texture
// eight source pixels per output pixel: +4 rounds, >>3 divides by 8
694 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
696 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
697 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
698 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
699 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
704 // average 3D mipmap with parent width == 1
// NOTE(review): only i0 and i1 advance in this loop although i2 and i3 are
// read too; harmless if w is 1 here (single iteration) - confirm.
705 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
707 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
708 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
709 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
710 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
716 if (texture->mipmap[i-1][2] > 1)
718 // average 2D texture (common case)
// four source pixels per output pixel: +2 rounds, >>2 divides by 4
719 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
721 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
722 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
723 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
724 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
729 // 2D texture with parent width == 1
// two source pixels (one per row): +1 rounds, >>1 divides by 2
730 o[0] = (i0[0] + i1[0] + 1) >> 1;
731 o[1] = (i0[1] + i1[1] + 1) >> 1;
732 o[2] = (i0[2] + i1[2] + 1) >> 1;
733 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then rebuilds
// the mip chain. Does nothing for an invalid index.
// NOTE(review): the visible addressing always uses level 0 (mipmap[0][2] and
// no level offset) regardless of `mip`, and no bounds check of the block
// against the texture size is visible - confirm both.
740 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
742 DPSOFTRAST_Texture *texture;
744 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
749 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
750 while (blockheight > 0)
// one source row per iteration; the destination advances by a full texture row
752 memcpy(dst, pixels, blockwidth * 4);
753 pixels += blockwidth * 4;
754 dst += texture->mipmap[0][2] * 4;
758 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of mip level 0 with `pixels` (mipmap[0][1] bytes are copied)
// and rebuilds the mip chain. Does nothing for an invalid index.
760 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
762 DPSOFTRAST_Texture *texture;
763 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
767 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
768 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of mip level `mip`, or 0 for an invalid texture index.
// NOTE(review): `mip` is used as an array index with no range check visible.
770 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
772 DPSOFTRAST_Texture *texture;
773 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
774 return texture->mipmap[mip][2];
// Returns the height of mip level `mip`, or 0 for an invalid texture index.
// NOTE(review): `mip` is used as an array index with no range check visible.
776 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
778 DPSOFTRAST_Texture *texture;
779 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
780 return texture->mipmap[mip][3];
// Returns the depth of mip level `mip`, or 0 for an invalid texture index.
// NOTE(review): `mip` is used as an array index with no range check visible.
782 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
784 DPSOFTRAST_Texture *texture;
785 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
786 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or a null pointer for an invalid texture index.
// NOTE(review): `mip` is used as an array index with no range check visible.
788 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
790 DPSOFTRAST_Texture *texture;
791 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794 return texture->bytes + texture->mipmap[mip][0];
// Sets a texture's filter mode. Filters above DPSOFTRAST_TEXTURE_FILTER_LINEAR
// are reported as an error on textures created without the MIPMAP flag.
// Does nothing for an invalid index.
796 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
798 DPSOFTRAST_Texture *texture;
799 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
800 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
802 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
807 texture->filter = filter;
810 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current end of the command stream by copying
// commandpool.freecommand into dpsoftrast.drawcommand.
812 static void DPSOFTRAST_Draw_SyncCommands(void)
// with threads enabled, fence before the new position is stored
814 if(dpsoftrast.usethreads) MEMORY_BARRIER;
815 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make `space` bytes available in the command pool by recomputing how
// much of it the threads still hold, waiting on a thread when needed.
818 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
820 DPSOFTRAST_State_Thread *thread;
822 int freecommand = dpsoftrast.commandpool.freecommand;
823 int usedcommands = dpsoftrast.commandpool.usedcommands;
// condition: the pool already has at least `space` bytes free
824 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
826 DPSOFTRAST_Draw_SyncCommands();
// measure, per thread, the distance between its position and freecommand
832 for (i = 0; i < dpsoftrast.numthreads; i++)
834 thread = &dpsoftrast.threads[i];
835 commandoffset = freecommand - thread->commandoffset;
// a negative distance means the position wrapped around the pool
836 if (commandoffset < 0)
837 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest distance found
838 if (commandoffset > usedcommands)
841 usedcommands = commandoffset;
844 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
846 thread = &dpsoftrast.threads[waitindex];
847 Thread_LockMutex(thread->drawmutex);
// wait only if the thread has not reached the published command position
848 if (thread->commandoffset != dpsoftrast.drawcommand)
850 thread->waiting = true;
// wake the thread first if it has flagged itself as starving
851 if (thread->starving) Thread_CondSignal(thread->drawcond);
852 Thread_CondWait(thread->waitcond, thread->drawmutex);
853 thread->waiting = false;
855 Thread_UnlockMutex(thread->drawmutex);
857 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds `size` up to the next multiple of COMMAND_SIZE. The mask arithmetic
// assumes COMMAND_SIZE is a power of two.
860 #define DPSOFTRAST_ALIGNCOMMAND(size) \
861 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name>
// and returns it as a typed pointer.
862 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
863 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool and fills in the command header
// (opcode and commandsize). When the pool is too full it first frees space,
// through DPSOFTRAST_Draw_FreeCommandPool with threads or
// DPSOFTRAST_Draw_FlushThreads without. When the tail of the pool cannot hold
// the command, a Reset command is written there.
865 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
867 DPSOFTRAST_Command *command;
868 int freecommand = dpsoftrast.commandpool.freecommand;
869 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra = room for one bare command header, plus the skipped tail if wrapping
870 int extra = sizeof(DPSOFTRAST_Command);
871 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
872 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
873 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
875 if (dpsoftrast.usethreads)
876 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
878 DPSOFTRAST_Draw_FlushThreads();
// re-read the counters, which the calls above may have changed
879 freecommand = dpsoftrast.commandpool.freecommand;
880 usedcommands = dpsoftrast.commandpool.usedcommands;
882 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
// mark the unusable tail with a Reset command and count it as used
884 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
885 command->opcode = DPSOFTRAST_OPCODE_Reset;
886 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
889 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
890 command->opcode = opcode;
// NOTE(review): commandsize is an unsigned short, so an int `size` above
// 65535 would be truncated - confirm callers stay below that.
891 command->commandsize = size;
893 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
895 dpsoftrast.commandpool.freecommand = freecommand;
896 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back `size` bytes of command pool space by adjusting the pool's
// freecommand and usedcommands counters.
900 static void DPSOFTRAST_UndoCommand(int size)
902 int freecommand = dpsoftrast.commandpool.freecommand;
903 int usedcommands = dpsoftrast.commandpool.usedcommands;
906 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
907 usedcommands -= size;
908 dpsoftrast.commandpool.freecommand = freecommand;
909 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: set the viewport rectangle.
912 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Stores the rectangle in the thread state and marks the framebuffer-derived
// values stale.
913 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
915 thread->viewport[0] = command->x;
916 thread->viewport[1] = command->y;
917 thread->viewport[2] = command->width;
918 thread->viewport[3] = command->height;
919 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: queues a Viewport command and also updates the global copy of
// the viewport and its derived center/scale immediately.
921 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
923 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
926 command->width = width;
927 command->height = height;
929 dpsoftrast.viewport[0] = x;
930 dpsoftrast.viewport[1] = y;
931 dpsoftrast.viewport[2] = width;
932 dpsoftrast.viewport[3] = height;
933 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to an RGBA value.
936 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Works on the part of the scissor rectangle that falls inside this thread's
// row bands, for each of the up to four color buffers.
937 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
939 int i, x1, y1, x2, y2, w, h, x, y;
940 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are current
944 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
945 miny1 = thread->miny1;
946 maxy1 = thread->maxy1;
947 miny2 = thread->miny2;
948 maxy2 = thread->maxy2;
949 x1 = thread->fb_scissor[0];
950 y1 = thread->fb_scissor[1];
951 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
952 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clamp the row range to the rows this thread owns
953 if (y1 < miny1) y1 = miny1;
954 if (y2 > maxy2) y2 = maxy2;
959 // FIXME: honor fb_colormask?
960 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
961 for (i = 0;i < 4;i++)
963 if (!dpsoftrast.fb_colorpixels[i])
// two passes: rows up to min(y2, maxy1), then from max(y, miny2) up to min(y2, maxy2)
965 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
968 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
969 for (x = x1;x < x2;x++)
// Public API: queues a ClearColor command.
974 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
976 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to a depth value.
983 DEFCOMMAND(3, ClearDepth, float depth;)
// Works on the part of the scissor rectangle that falls inside this thread's
// row bands, using the converted integer depth value.
984 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
986 int x1, y1, x2, y2, w, h, x, y;
987 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are current
991 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
992 miny1 = thread->miny1;
993 maxy1 = thread->maxy1;
994 miny2 = thread->miny2;
995 maxy2 = thread->maxy2;
996 x1 = thread->fb_scissor[0];
997 y1 = thread->fb_scissor[1];
998 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
999 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clamp the row range to the rows this thread owns
1000 if (y1 < miny1) y1 = miny1;
1001 if (y2 > maxy2) y2 = maxy2;
1006 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two passes: rows up to min(y2, maxy1), then from max(y, miny2) up to min(y2, maxy2)
1007 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1008 for (;y < bandy;y++)
1010 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1011 for (x = x1;x < x2;x++)
// Public API: queues a ClearDepth command.
1015 void DPSOFTRAST_ClearDepth(float d)
1017 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: enable or disable writes per color channel.
1021 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Normalises each flag to 0/1 and builds the matching 32-bit pixel mask.
1022 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1024 thread->colormask[0] = command->r != 0;
1025 thread->colormask[1] = command->g != 0;
1026 thread->colormask[2] = command->b != 0;
1027 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag yields 0 or all ones, which is then masked to the
// channel's byte: r = bits 16-23, g = 8-15, b = 0-7, a = 24-31
1028 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public API: queues a ColorMask command.
1030 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1032 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: enable or disable depth testing.
1039 DEFCOMMAND(5, DepthTest, int enable;)
// Stores the flag and marks the derived depth function stale.
1040 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1042 thread->depthtest = command->enable;
1043 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Public API: queues a DepthTest command.
1045 void DPSOFTRAST_DepthTest(int enable)
1047 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1048 command->enable = enable;
// Command 6: enable or disable the scissor test.
1051 DEFCOMMAND(6, ScissorTest, int enable;)
// Stores the flag and marks the framebuffer-derived values stale.
1052 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1054 thread->scissortest = command->enable;
1055 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: queues a ScissorTest command.
1057 void DPSOFTRAST_ScissorTest(int enable)
1059 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1060 command->enable = enable;
// Command 7: set the scissor rectangle (carried as floats).
1063 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Stores the rectangle and marks the framebuffer-derived values stale.
1064 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1066 thread->scissor[0] = command->x;
1067 thread->scissor[1] = command->y;
1068 thread->scissor[2] = command->width;
1069 thread->scissor[3] = command->height;
1070 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point for the scissor rectangle: allocates a Scissor command
// and records the rectangle's width and height in it.
// NOTE(review): the stores of x and y into the command are not visible here - confirm.
1072 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1074 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1077 command->width = width;
1078 command->height = height;
1081 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1082 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1084 thread->blendfunc[0] = command->sfactor;
1085 thread->blendfunc[1] = command->dfactor;
1086 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1088 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1090 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1091 command->sfactor = sfactor;
1092 command->dfactor = dfactor;
1095 DEFCOMMAND(9, BlendSubtract, int enable;)
1096 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1098 thread->blendsubtract = command->enable;
1099 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1101 void DPSOFTRAST_BlendSubtract(int enable)
1103 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1104 command->enable = enable;
1107 DEFCOMMAND(10, DepthMask, int enable;)
1108 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1110 thread->depthmask = command->enable;
1112 void DPSOFTRAST_DepthMask(int enable)
1114 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1115 command->enable = enable;
1118 DEFCOMMAND(11, DepthFunc, int func;)
1119 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1121 thread->depthfunc = command->func;
1123 void DPSOFTRAST_DepthFunc(int func)
1125 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1126 command->func = func;
1129 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1130 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1132 thread->depthrange[0] = command->nearval;
1133 thread->depthrange[1] = command->farval;
1135 void DPSOFTRAST_DepthRange(float nearval, float farval)
1137 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1138 command->nearval = nearval;
1139 command->farval = farval;
1142 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1143 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1145 thread->polygonoffset[0] = command->alongnormal;
1146 thread->polygonoffset[1] = command->intoview;
1148 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1150 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1151 command->alongnormal = alongnormal;
1152 command->intoview = intoview;
1155 DEFCOMMAND(14, CullFace, int mode;)
1156 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1158 thread->cullface = command->mode;
1160 void DPSOFTRAST_CullFace(int mode)
1162 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1163 command->mode = mode;
1166 DEFCOMMAND(15, AlphaTest, int enable;)
1167 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1169 thread->alphatest = command->enable;
1171 void DPSOFTRAST_AlphaTest(int enable)
1173 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1174 command->enable = enable;
1177 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1178 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1180 thread->alphafunc = command->func;
1181 thread->alphavalue = command->ref;
// Public entry point for the alpha comparison: allocates an AlphaFunc command
// and records the comparison selector in it.
// NOTE(review): the store of ref into the command is not visible here - confirm.
1183 void DPSOFTRAST_AlphaFunc(int func, float ref)
1185 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1186 command->func = func;
1190 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1192 dpsoftrast.color[0] = r;
1193 dpsoftrast.color[1] = g;
1194 dpsoftrast.color[2] = b;
1195 dpsoftrast.color[3] = a;
// Reads a rectangular block of the color framebuffer into outpixels.
// The requested block is clipped to the framebuffer, and rows are addressed
// vertically flipped: output row (y - by1) comes from framebuffer row
// (fb_height - 1 - y).
// NOTE(review): the per-pixel copy bodies and the declarations of bx1/by1/x/y/b/o
// are not visible here; the two row loops differ by dpsoftrast.bigendian.
1198 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte strides: four bytes per pixel in both the output block and the framebuffer
1200 int outstride = blockwidth * 4;
1201 int instride = dpsoftrast.fb_width * 4;
1204 int bx2 = blockx + blockwidth;
1205 int by2 = blocky + blockheight;
1209 unsigned char *inpixels;
// clip the block to the framebuffer bounds
1213 if (bx1 < 0) bx1 = 0;
1214 if (by1 < 0) by1 = 0;
1215 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1216 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1218 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1219 if (dpsoftrast.bigendian)
1221 for (y = by1;y < by2;y++)
// b: first source byte of this row (flipped), o: first destination byte
1223 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1224 o = (unsigned char *)outpixels + (y - by1) * outstride;
1225 for (x = bx1;x < bx2;x++)
1238 for (y = by1;y < by2;y++)
1240 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1241 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the color framebuffer (origin sx,sy)
// into mip level `mip` of texture `index` (origin tx,ty), then regenerates the
// mipmaps when the texture has more than one level.
// Does nothing if the texture handle is invalid or mip is out of range.
// NOTE(review): the declarations of tx1/ty1/sx1/sy1/tw/th/sw/sh and the
// computation of tw/th/sw/sh are not visible here - confirm against the full file.
1247 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1251 int tx2 = tx + width;
1252 int ty2 = ty + height;
1255 int sx2 = sx + width;
1256 int sy2 = sy + height;
1266 unsigned int *spixels;
1267 unsigned int *tpixels;
1268 DPSOFTRAST_Texture *texture;
1269 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1270 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the framebuffer; destination is the selected mip level
1272 spixels = dpsoftrast.fb_colorpixels[0];
1273 swidth = dpsoftrast.fb_width;
1274 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, [2]/[3] as width/height
1275 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1276 twidth = texture->mipmap[mip][2];
1277 theight = texture->mipmap[mip][3];
// clip both rectangles to their surfaces
1278 if (tx1 < 0) tx1 = 0;
1279 if (ty1 < 0) ty1 = 0;
1280 if (tx2 > twidth) tx2 = twidth;
1281 if (ty2 > theight) ty2 = theight;
1282 if (sx1 < 0) sx1 = 0;
1283 if (sy1 < 0) sy1 = 0;
1284 if (sx2 > swidth) sx2 = swidth;
1285 if (sy2 > sheight) sy2 = sheight;
// the copied area is the smaller of the two clipped rectangles
1290 if (tw > sw) tw = sw;
1291 if (th > sh) th = sh;
1292 if (tw < 1 || th < 1)
// source rows are walked downward from the flipped start row (sy1 - y)
1294 sy1 = sheight - 1 - sy1;
1295 for (y = 0;y < th;y++)
1296 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1297 if (texture->mipmaps > 1)
1298 DPSOFTRAST_Texture_CalculateMipmaps(index);
1301 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1302 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1304 if (thread->texbound[command->unitnum])
1305 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1306 thread->texbound[command->unitnum] = command->texture;
1308 void DPSOFTRAST_SetTexture(int unitnum, int index)
1310 DPSOFTRAST_Command_SetTexture *command;
1311 DPSOFTRAST_Texture *texture;
1312 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1314 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1317 texture = DPSOFTRAST_Texture_GetByIndex(index);
1318 if (index && !texture)
1320 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1324 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1325 command->unitnum = unitnum;
1326 command->texture = texture;
1328 dpsoftrast.texbound[unitnum] = texture;
1329 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1332 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1334 dpsoftrast.pointer_vertex3f = vertex3f;
1335 dpsoftrast.stride_vertex = stride;
1337 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1339 dpsoftrast.pointer_color4f = color4f;
1340 dpsoftrast.pointer_color4ub = NULL;
1341 dpsoftrast.stride_color = stride;
1343 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1345 dpsoftrast.pointer_color4f = NULL;
1346 dpsoftrast.pointer_color4ub = color4ub;
1347 dpsoftrast.stride_color = stride;
1349 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1351 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1352 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1353 dpsoftrast.stride_texcoord[unitnum] = stride;
1356 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1357 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1359 thread->shader_mode = command->mode;
1360 thread->shader_permutation = command->permutation;
1361 thread->shader_exactspecularmath = command->exactspecularmath;
1363 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1365 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1366 command->mode = mode;
1367 command->permutation = permutation;
1368 command->exactspecularmath = exactspecularmath;
1370 dpsoftrast.shader_mode = mode;
1371 dpsoftrast.shader_permutation = permutation;
1372 dpsoftrast.shader_exactspecularmath = exactspecularmath;
1375 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1376 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1378 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1380 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1382 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1383 command->index = index;
1384 command->val[0] = v0;
1385 command->val[1] = v1;
1386 command->val[2] = v2;
1387 command->val[3] = v3;
1389 dpsoftrast.uniform4f[index*4+0] = v0;
1390 dpsoftrast.uniform4f[index*4+1] = v1;
1391 dpsoftrast.uniform4f[index*4+2] = v2;
1392 dpsoftrast.uniform4f[index*4+3] = v3;
1394 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1396 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1397 command->index = index;
1398 memcpy(command->val, v, sizeof(command->val));
1400 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1403 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1404 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1406 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets `arraysize` consecutive 4x4 matrix uniforms starting at `uniform`.
// v points at arraysize*16 floats; each matrix advances the uniform index by
// four slots and v by 16 floats. For every matrix a UniformMatrix4f command is
// allocated, and the rows are written both to the command and to
// dpsoftrast.uniform4f.
// NOTE(review): the branch conditions around the aligned/unaligned loads and
// around the transpose block are not fully visible here; presumably the
// transpose block runs only when `transpose` is non-zero - confirm.
1408 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1412 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1414 __m128 m0, m1, m2, m3;
1415 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1416 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the caller's pointer
1417 if (((size_t)v)&(ALIGN_SIZE-1))
1419 m0 = _mm_loadu_ps(v);
1420 m1 = _mm_loadu_ps(v+4);
1421 m2 = _mm_loadu_ps(v+8);
1422 m3 = _mm_loadu_ps(v+12);
1426 m0 = _mm_load_ps(v);
1427 m1 = _mm_load_ps(v+4);
1428 m2 = _mm_load_ps(v+8);
1429 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 via unpack and move-half shuffles
1433 __m128 t0, t1, t2, t3;
1434 t0 = _mm_unpacklo_ps(m0, m1);
1435 t1 = _mm_unpacklo_ps(m2, m3);
1436 t2 = _mm_unpackhi_ps(m0, m1);
1437 t3 = _mm_unpackhi_ps(m2, m3);
1438 m0 = _mm_movelh_ps(t0, t1);
1439 m1 = _mm_movehl_ps(t1, t0);
1440 m2 = _mm_movelh_ps(t2, t3);
1441 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both destinations must be 16-byte aligned
1443 _mm_store_ps(command->val, m0);
1444 _mm_store_ps(command->val+4, m1);
1445 _mm_store_ps(command->val+8, m2);
1446 _mm_store_ps(command->val+12, m3);
1447 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1448 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1449 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1450 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1455 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1456 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1458 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: allocates a Uniform1i command, records the uniform
// index in it, and mirrors the value into dpsoftrast.uniform1i.
// NOTE(review): the store of i0 into the command is not visible here - confirm.
1460 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1462 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1463 command->index = index;
1466 dpsoftrast.uniform1i[command->index] = i0;
1469 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1470 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1472 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1473 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1475 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1477 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1478 command->clipplane[0] = x;
1479 command->clipplane[1] = y;
1480 command->clipplane[2] = z;
1481 command->clipplane[3] = w;
// Copies `size` four-float elements from a strided byte source into dst.
// dst is written with aligned stores; the source is read with unaligned loads
// when either the source address or the stride is not a multiple of
// ALIGN_SIZE, and with aligned loads otherwise.
// NOTE(review): the loop headers and pointer increments are not visible here.
1485 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1487 float *end = dst + size*4;
1488 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1492 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1501 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` three-float elements from a strided byte source into
// four-float elements in dst, setting the fourth component to 1.0f.
// Tightly packed input (stride == sizeof(float[3])) takes a fast path that
// converts four elements from three 16-byte loads, up to end4; the general
// path then converts one element per 16-byte load. Both paths have unaligned
// and aligned load variants; dst is always written with aligned stores.
// The recurring idiom is: rotate lanes so the unwanted value sits in lane 0,
// overwrite lane 0 with 1.0f via _mm_move_ss, then rotate back so 1.0f ends
// up in lane 3.
// NOTE(review): the single-element path loads four floats from a three-float
// element, so it reads one float past the element - confirm the source buffer
// tolerates that for the last vertex.
// NOTE(review): loop headers and dst increments are not visible here.
1508 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1510 float *end = dst + size*4;
1511 if (stride == sizeof(float[3]))
// end of the portion that can be handled four elements at a time
1513 float *end4 = dst + (size&~3)*4;
1514 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1518 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// element 0: x0 y0 z0 1
1519 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1520 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1521 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 1: x1 y1 z1 1
1522 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1523 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1524 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 2: x2 y2 z2 1
1525 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1526 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1527 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1528 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 3: x3 y3 z3 1
1529 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1530 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532 src += 4*sizeof(float[3]);
// same conversion as above, using aligned loads
1539 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1540 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1541 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1542 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1543 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1544 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1545 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1546 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1547 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1548 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1549 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1550 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1551 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1553 src += 4*sizeof(float[3]);
// general path: one element per load, unaligned or aligned
1557 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1561 __m128 v = _mm_loadu_ps((const float *)src);
1562 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1563 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1564 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1565 _mm_store_ps(dst, v);
1574 __m128 v = _mm_load_ps((const float *)src);
1575 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1576 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1577 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1578 _mm_store_ps(dst, v);
// Expands `size` two-float elements from a strided byte source into
// four-float elements in dst, filling the last two components with 0 and 1.
// Tightly packed input (stride == sizeof(float[2])) is converted two elements
// per 16-byte load, up to end2; other input is converted one element at a
// time with a 64-bit load into the low half of the (0,0,0,1) template.
// NOTE(review): loop headers and dst increments are not visible here.
1585 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1587 float *end = dst + size*4;
// template supplying the z = 0, w = 1 lanes
1588 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1589 if (stride == sizeof(float[2]))
1591 float *end2 = dst + (size&~1)*4;
1592 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1596 __m128 v = _mm_loadu_ps((const float *)src);
// low pair of v -> first element, high pair of v -> second element
1597 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1598 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1600 src += 2*sizeof(float[2]);
1607 __m128 v = _mm_load_ps((const float *)src);
1608 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1609 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1611 src += 2*sizeof(float[2]);
1617 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` four-byte elements from a strided byte source into
// four-float elements in dst, scaling each byte by 1/255.
// Tightly packed input (stride == 4) is converted four elements per 16-byte
// load, up to end4; other input is converted one element per 32-bit load.
// Bytes are widened to 32-bit integers by interleaving with zero twice, then
// converted to float and multiplied by the scale.
// NOTE(review): loop headers and dst increments are not visible here.
1623 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1625 float *end = dst + size*4;
1626 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1627 if (stride == sizeof(unsigned char[4]))
1629 float *end4 = dst + (size&~3)*4;
1630 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = elements 0-1 as 16-bit lanes, v2 = elements 2-3 as 16-bit lanes
1634 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1635 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1636 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1637 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1638 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1640 src += 4*sizeof(unsigned char[4]);
// same conversion as above, using an aligned load
1647 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1648 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1649 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1650 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1651 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1653 src += 4*sizeof(unsigned char[4]);
// general path: one four-byte element at a time
1659 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1660 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of the single four-float value at src.
// src is read once with an unaligned load; dst is written with aligned stores.
// NOTE(review): the loop header running dst up to `end` is not visible here.
1666 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1668 float *end = dst + 4*size;
1669 __m128 v = _mm_loadu_ps(src);
1672 _mm_store_ps(dst, v);
// Transforms numitems four-float vertices from in4f by the 4x4 matrix
// inmatrix16f. Each result is x*m0 + y*m1 + z*m2 + w*m3, where m0..m3 are the
// four consecutive groups of four floats in inmatrix16f.
// An identity matrix is detected with memcmp and reduces to a plain copy,
// which is skipped when out4f and in4f are the same buffer.
// The matrix is read with unaligned loads; the vertices are read unaligned or
// aligned depending on in4f's address.
// NOTE(review): the loop headers and the store of each result (presumably to
// out4f) are not visible here - confirm.
1678 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1681 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1682 __m128 m0, m1, m2, m3;
1684 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1686 // fast case for identity matrix
1687 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1690 end = out4f + numitems*4;
1691 m0 = _mm_loadu_ps(inmatrix16f);
1692 m1 = _mm_loadu_ps(inmatrix16f + 4);
1693 m2 = _mm_loadu_ps(inmatrix16f + 8);
1694 m3 = _mm_loadu_ps(inmatrix16f + 12);
1695 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1699 __m128 v = _mm_loadu_ps(in4f);
1701 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1702 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1704 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1713 __m128 v = _mm_load_ps(in4f);
1715 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1716 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1717 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1718 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems four-float vertices from in4f to out4f.
//
// out4f   destination, room for numitems*4 floats
// in4f    source, numitems*4 floats
// numitems number of vertices; zero or negative copies nothing
//
// An in-place call (out4f == in4f) is a no-op.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// memcpy is undefined for overlapping regions, which an in-place copy is;
	// DPSOFTRAST_Vertex_Transform's identity fast path skips the same case.
	// A negative count would otherwise turn into a huge size_t length.
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// Perspective-projects the four-lane vector `in` into `out`.
// The lanes are rotated so that lane 0 (holding the input's last component)
// can be replaced by 1.0f; the vector is then scaled by viewportscale, divided
// by the original last component (w), offset by viewportcenter, and rotated
// back. viewportcenter/viewportscale are therefore indexed in the rotated
// lane order, with element 0 applying to the 1.0f lane.
1732 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1734 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1735 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1736 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1737 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same visible body as DPSOFTRAST_PROJECTVERTEX: rotates the lanes, replaces
// lane 0 with 1.0f, applies scale / w + center, and rotates back.
// NOTE(review): no use of this macro is visible in this part of the file.
1740 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1742 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1743 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1744 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1745 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a four-lane vector by a matrix given as four vectors m0..m3:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, each component being broadcast
// across all lanes before the multiply.
// NOTE(review): the definition of the temporary `p` (presumably a copy of
// `in`) is not visible here - confirm.
1748 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1751 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1752 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1753 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1754 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space scanline range for an axis-aligned bounding box.
// minposf/maxposf are the box corners (16-byte aligned, read with aligned
// loads) and inmatrix16f is the matrix applied to them. The eight corners are
// transformed; for each one, clipdist = z + w decides whether it is in front
// of the near plane. Corners in front contribute their projected value
// directly, and corners behind contribute the points where their three box
// edges cross clipdist == 0. The resulting min/max are clamped to [-2, 2],
// mapped through the viewport, and written to *starty / *endy.
// clipmask starts with one bit set per corner; a bit is cleared when that
// corner is in front of the near plane.
// NOTE(review): the return statement is not visible here; presumably clipmask
// is returned - confirm.
1757 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1759 int clipmask = 0xFF;
1760 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1761 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1762 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1763 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap the first two lanes of every matrix vector so the transformed y lands
// in lane 0, where the scalar (_ss) operations below work
1764 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1765 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1766 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1767 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k; if it is in front of the near plane,
// clear its clipmask bit and fold lane0 / w into minproj/maxproj
1768 #define BBFRONT(k, pos) \
1770 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1771 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1772 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1775 clipmask &= ~(1<<k); \
1776 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1777 minproj = _mm_min_ss(minproj, proj); \
1778 maxproj = _mm_max_ss(maxproj, proj); \
1782 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1783 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1784 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1785 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1786 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1787 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k): for a clipped corner k, visit the neighbours k^1, k^2, k^4 (the
// corners differing in one axis); for each unclipped neighbour, interpolate
// along the edge to clipdist == 0 and fold the projected point into
// minproj/maxproj
1791 if (clipmask&(1<<k)) \
1793 if (!(clipmask&(1<<(k^1)))) \
1795 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1796 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1797 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1798 minproj = _mm_min_ss(minproj, proj); \
1799 maxproj = _mm_max_ss(maxproj, proj); \
1801 if (!(clipmask&(1<<(k^2)))) \
1803 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1804 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1805 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1806 minproj = _mm_min_ss(minproj, proj); \
1807 maxproj = _mm_max_ss(maxproj, proj); \
1809 if (!(clipmask&(1<<(k^4)))) \
1811 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1812 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1813 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1814 minproj = _mm_min_ss(minproj, proj); \
1815 maxproj = _mm_max_ss(maxproj, proj); \
1819 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring element 2 of the viewport vectors into lane 0 for the scalar math below
1820 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1821 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to pixels
1822 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1823 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1824 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1825 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// start comes from maxproj and end from minproj (truncated, end + 1)
1826 *starty = _mm_cvttss_si32(maxproj);
1827 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems already-transformed vertices to screen space.
// Each input vertex is copied unchanged to out4f, projected with
// DPSOFTRAST_PROJECTVERTEX, and the result stored to screen4f. The
// component-wise min/max of the inputs is tracked and handed to
// DPSOFTRAST_Vertex_BoundY (with an identity matrix) to produce
// *starty / *endy; that function's result is returned.
// Input is read unaligned or aligned depending on in4f's address; out4f and
// screen4f are written with aligned stores.
// NOTE(review): loop headers and pointer increments are not visible here.
1831 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1833 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1834 float *end = out4f + numitems*4;
1835 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1836 __m128 minpos, maxpos;
1837 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1839 minpos = maxpos = _mm_loadu_ps(in4f);
1842 __m128 v = _mm_loadu_ps(in4f);
1843 minpos = _mm_min_ps(minpos, v);
1844 maxpos = _mm_max_ps(maxpos, v);
1845 _mm_store_ps(out4f, v);
1846 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1847 _mm_store_ps(screen4f, v);
// aligned-input variant of the same loop
1855 minpos = maxpos = _mm_load_ps(in4f);
1858 __m128 v = _mm_load_ps(in4f);
1859 minpos = _mm_min_ps(minpos, v);
1860 maxpos = _mm_max_ps(maxpos, v);
1861 _mm_store_ps(out4f, v);
1862 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1863 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries for DPSOFTRAST_Vertex_BoundY
1871 ALIGN(float minposf[4]);
1872 ALIGN(float maxposf[4]);
1873 _mm_store_ps(minposf, minpos);
1874 _mm_store_ps(maxposf, maxpos);
1875 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms numitems vertices by inmatrix16f and projects them to screen
// space. An identity matrix is detected with memcmp and delegated to
// DPSOFTRAST_Vertex_Project. Otherwise each vertex is transformed and stored
// to out4f, then projected and stored to screen4f. The min/max are taken over
// the untransformed inputs and passed, together with the matrix, to
// DPSOFTRAST_Vertex_BoundY, whose result is returned.
// Input is read unaligned or aligned depending on in4f's address; out4f and
// screen4f are written with aligned stores.
// NOTE(review): loop headers and pointer increments are not visible here.
1880 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1882 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1883 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1885 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1886 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1887 end = out4f + numitems*4;
1888 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1889 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1890 m0 = _mm_loadu_ps(inmatrix16f);
1891 m1 = _mm_loadu_ps(inmatrix16f + 4);
1892 m2 = _mm_loadu_ps(inmatrix16f + 8);
1893 m3 = _mm_loadu_ps(inmatrix16f + 12);
1894 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1896 minpos = maxpos = _mm_loadu_ps(in4f);
1899 __m128 v = _mm_loadu_ps(in4f);
1900 minpos = _mm_min_ps(minpos, v);
1901 maxpos = _mm_max_ps(maxpos, v);
1902 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1903 _mm_store_ps(out4f, v);
1904 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1905 _mm_store_ps(screen4f, v);
// aligned-input variant of the same loop
1913 minpos = maxpos = _mm_load_ps(in4f);
1916 __m128 v = _mm_load_ps(in4f);
1917 minpos = _mm_min_ps(minpos, v);
1918 maxpos = _mm_max_ps(maxpos, v);
1919 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1920 _mm_store_ps(out4f, v);
1921 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1922 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries for DPSOFTRAST_Vertex_BoundY
1930 ALIGN(float minposf[4]);
1931 ALIGN(float maxposf[4]);
1932 _mm_store_ps(minposf, minpos);
1933 _mm_store_ps(maxposf, maxpos);
1934 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads the client array `inarray` for the current draw range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices) into
// dpsoftrast.post_array4f[outarray], expanding every element to four floats.
// Positions go through DPSOFTRAST_Load3fTo4f. Colors use the float pointer
// if set, else the unsigned-byte pointer, else the constant dpsoftrast.color
// repeated for every vertex. Texture coordinates are loaded with
// DPSOFTRAST_Load2fTo4f / Load3fTo4f / Load4fTo4f, selected by the unit's
// component count.
// Strides are in bytes: the start address is pointer + firstvertex * stride.
// NOTE(review): the switch header, the texcoord case labels, and the return
// statement are not visible here; presumably outf is returned - confirm.
1940 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1943 float *outf = dpsoftrast.post_array4f[outarray];
1944 const unsigned char *inb;
1945 int firstvertex = dpsoftrast.firstvertex;
1946 int numvertices = dpsoftrast.numvertices;
1950 case DPSOFTRAST_ARRAY_POSITION:
1951 stride = dpsoftrast.stride_vertex;
1952 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1953 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1955 case DPSOFTRAST_ARRAY_COLOR:
1956 stride = dpsoftrast.stride_color;
1957 if (dpsoftrast.pointer_color4f)
1959 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1960 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1962 else if (dpsoftrast.pointer_color4ub)
// stride was already set to stride_color at the top of this case
1964 stride = dpsoftrast.stride_color;
1965 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1966 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array set: replicate the constant color
1970 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinate arrays are indexed relative to DPSOFTRAST_ARRAY_TEXCOORD0
1974 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1975 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1977 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1978 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1981 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1984 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1987 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms an array in place by inmatrix16f.
// With inarray >= 0 the client array is first loaded into
// post_array4f[outarray]; with a negative inarray the existing contents of
// post_array4f[outarray] are transformed.
// NOTE(review): the return statement is not visible here; presumably `data`
// is returned - confirm.
1999 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
2001 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2002 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects an array to screen space without a matrix.
// The source is chosen as in DPSOFTRAST_Array_Transform (load when
// inarray >= 0, otherwise reuse post_array4f[outarray]). Screen coordinates
// go to dpsoftrast.screencoord4f, the scanline range to
// dpsoftrast.drawstarty / drawendy, and the projection's return value to
// dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible here; presumably `data`
// is returned - confirm.
2007 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
2010 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2011 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms an array by inmatrix16f in place and projects it to screen space.
// The source is chosen as in DPSOFTRAST_Array_Transform (load when
// inarray >= 0, otherwise reuse post_array4f[outarray]). Screen coordinates
// go to dpsoftrast.screencoord4f, the scanline range to
// dpsoftrast.drawstarty / drawendy, and the projection's return value to
// dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible here; presumably `data`
// is returned - confirm.
2019 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2022 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2023 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel 1/w values for a span, over x in [startx, endx).
// w is evaluated from the triangle's plane equation (w[0] per x, w[1] per y,
// w[2] constant) at the span origin. When the x slope is zero a separate
// constant-w loop is used. Otherwise the span is walked in subspans of up to
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the exact reciprocal is computed at the
// subspan ends and z is stepped linearly (dz) in between.
// NOTE(review): the loop bodies, including the writes (presumably to zf), and
// the declarations of x, z and dz are not visible here - confirm.
2030 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2033 int startx = span->startx;
2034 int endx = span->endx;
2035 float wslope = triangle->w[0];
2036 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2037 float endz = 1.0f / (w + wslope * startx);
2038 if (triangle->w[0] == 0)
2040 // LordHavoc: fast flat polygons (HUD/menu)
2041 for (x = startx;x < endx;x++)
2045 for (x = startx;x < endx;)
2047 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last subspan: stop exactly at the final pixel of the span
2049 if (nextsub >= endx) nextsub = endsub = endx-1;
2050 endz = 1.0f / (w + wslope * nextsub);
// guard against a zero-length subspan before dividing
2051 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2052 for (; x <= endsub; x++, z += dz)
// Final stage for one span of 8-bit BGRA colors (in4ub): trims span->pixelmask by alpha,
// splits the span into runs of unmasked pixels, and copies or blends each run into
// dpsoftrast.fb_colorpixels[0] according to thread->fb_blendmode.
// NOTE(review): braces and several short statements are not visible in this excerpt, so
// parts of the control flow between the lines below cannot be confirmed from here.
2057 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2061 int startx = span->startx;
2062 int endx = span->endx;
// 32-bit view of the input colors, one element per pixel
2065 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2066 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte view and 32-bit view of the same color buffer
2067 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2068 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views to the span origin: row span->y, column span->x, 4 bytes per pixel
2071 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2072 pixeli += span->y * dpsoftrast.fb_width + span->x;
2073 // handle alphatest now (this affects depth writes too)
2074 if (thread->alphatest)
2075 for (x = startx;x < endx;x++)
// alpha is byte 3 of each pixel; values below 128 fail the test
2076 if (in4ub[x*4+3] < 128)
2077 pixelmask[x] = false;
2078 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2079 // helps sprites, text and hud artwork
2080 switch(thread->fb_blendmode)
2082 case DPSOFTRAST_BLENDMODE_ALPHA:
2083 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2084 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// the scan alternates between runs with alpha >= 1 (skipped over) and runs with
// alpha == 0 (alpha is unsigned, so "< 1" means zero), whose mask entries are cleared
2086 for (x = startx;x < endx;x++)
2088 if (in4ub[x*4+3] >= 1)
2093 while (++x < endx && in4ub[x*4+3] >= 1) ;
2095 if (x >= endx) break;
2097 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2098 if (x >= endx) break;
// the remaining blend modes have no alpha-driven mask trimming visible in this excerpt
2105 case DPSOFTRAST_BLENDMODE_OPAQUE:
2106 case DPSOFTRAST_BLENDMODE_ADD:
2107 case DPSOFTRAST_BLENDMODE_INVMOD:
2108 case DPSOFTRAST_BLENDMODE_MUL:
2109 case DPSOFTRAST_BLENDMODE_MUL2:
2110 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2111 case DPSOFTRAST_BLENDMODE_INVADD:
2114 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): this writes pixelmask[endx] and pixelmask[endx+1], so the mask buffer must
// hold at least endx+2 entries - confirm at the allocation site
2115 pixelmask[endx] = 1;
2116 pixelmask[endx+1] = 0;
2117 // LordHavoc: use a double loop to identify subspans, this helps the
2118 // optimized copy/blend loops to perform at their best, most triangles
2119 // have only one run of pixels, and do the search using wide reads...
2123 // if this pixel is masked off, it's probably not alone...
2130 // the 4-item search must be aligned or else it stalls badly
// up to three single-byte checks bring x to a multiple of 4 before the wide compare
2131 if ((x & 3) && !pixelmask[x])
2133 if(pixelmask[x]) goto endmasked;
2137 if(pixelmask[x]) goto endmasked;
2141 if(pixelmask[x]) goto endmasked;
// tests four mask bytes with one 32-bit read; all zero means all four pixels are masked off
2146 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2150 for (;!pixelmask[x];x++)
2152 // rather than continue the loop, just check the end variable
2157 // find length of subspan
2160 if (subx + 8 < endx)
2164 if(!pixelmask[subx]) goto endunmasked;
2168 if(!pixelmask[subx]) goto endunmasked;
2172 if(!pixelmask[subx]) goto endunmasked;
// 0x01010101 matches four consecutive mask bytes that are each exactly 1
2177 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2181 for (;pixelmask[subx];subx++)
2183 // the checks can overshoot, so make sure to clip it...
2187 // now that we know the subspan length... process!
2188 switch(thread->fb_blendmode)
2190 case DPSOFTRAST_BLENDMODE_OPAQUE:
// opaque: plain copy of pixels [x, subx) from the input to the framebuffer
// NOTE(review): both a memcpy form and an SSE copy form appear below; what selects
// between them is on lines not visible here
2194 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// 16 pixels per iteration using four unaligned 128-bit load/store pairs
2199 while (x + 16 <= subx)
2201 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2202 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2203 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2204 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels per iteration
2209 while (x + 4 <= subx)
2211 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2217 pixeli[x+1] = ini[x+1];
// FINISHBLEND(blend2, blend1): walks the run two pixels at a time and then handles one
// trailing pixel; src and dst are widened from 8 to 16 bits per channel before blending
// and packed back to bytes with unsigned saturation afterwards.
// NOTE(review): presumably blend2 is the two-pixel blend statement and blend1 the
// one-pixel variant; the lines that expand them are not visible here.
2227 case DPSOFTRAST_BLENDMODE_ALPHA:
2228 #define FINISHBLEND(blend2, blend1) \
2229 for (;x + 1 < subx;x += 2) \
2232 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2233 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2235 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2240 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2241 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2243 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2247 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// ALPHA: blend holds each pixel's source alpha in all four channels;
// ((a<<4)*(b<<4))>>16 equals a*b/256, so dst += (src - dst) * alpha / 256
2248 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2250 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2254 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2256 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2257 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2259 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2260 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst
2263 case DPSOFTRAST_BLENDMODE_ADD:
2264 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2266 case DPSOFTRAST_BLENDMODE_INVMOD:
2268 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2270 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2273 case DPSOFTRAST_BLENDMODE_MUL:
2274 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: same product shifted by 7 instead of 8, i.e. twice the MUL result
2276 case DPSOFTRAST_BLENDMODE_MUL2:
2277 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2279 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2281 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2284 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2288 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2290 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2291 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2293 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2294 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src / 256
2297 case DPSOFTRAST_BLENDMODE_INVADD:
2299 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2301 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples mip level mip of texture at coordinates (x, y), which are scaled by the mip's
// width and height, producing four bytes in c in the same channel order as stored.
// With DPSOFTRAST_TEXTURE_FILTER_LINEAR set the four neighbouring texels are blended
// bilinearly; otherwise a single texel is addressed.
// NOTE(review): the writes to c in the non-linear branch are on lines not visible here.
2309 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2310 // warning: this is SLOW, only use if the optimized per-span functions won't do
2312 const unsigned char * RESTRICT pixelbase;
2313 const unsigned char * RESTRICT pixel[4];
2314 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
// size-1 per axis: upper bound when clamping, AND mask when repeating (the AND form only
// wraps correctly when the size is a power of two)
2315 int wrapmask[2] = { width-1, height-1 };
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2316 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2317 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// coordinates in fixed point with 12 fractional bits; 2048 is half a texel
// NOTE(review): for small or negative x/y the float expression is negative, and converting
// a negative float to unsigned int is undefined behavior in C - confirm callers' range
2319 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2320 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2321 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four bilinear weights; they always sum to 0x1000*0x1000 = 1<<24
2322 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
// tc is unsigned, so these shifts never yield a negative texel index
2323 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2324 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2325 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// clamp each index to [0, size-1]; given the unsigned source the >= 0 tests cannot fail
2327 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2328 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2329 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2330 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
// repeat addressing through the AND mask
2334 tci[0] &= wrapmask[0];
2335 tci[1] &= wrapmask[1];
2336 tci1[0] &= wrapmask[0];
2337 tci1[1] &= wrapmask[1];
// addresses of the 2x2 texel neighbourhood, 4 bytes per texel, width texels per row
2339 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2340 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2341 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2342 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
// weighted sum per channel; >>24 removes the 1<<24 weight scale
2343 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2344 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2345 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2346 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// unfiltered lookup: truncate the scaled coordinates to an integer texel index
2350 int tci[2] = { x * width, y * height };
2351 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2353 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2354 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2358 tci[0] &= wrapmask[0];
2359 tci[1] &= wrapmask[1];
2361 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
// Samples the texture bound to unit texunitindex along one span, using the interpolated
// attribute arrayindex as texture coordinates, and writes four floats per pixel to out4f.
// Stored byte 2 goes to output channel 0 and stored byte 0 to output channel 2, and byte
// values are scaled to the 0..1 range.
// NOTE(review): several short lines (declarations, braces, the assignments of tcimin,
// tc and filter tests) are not visible in this excerpt.
2369 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2372 int startx = span->startx;
2373 int endx = span->endx;
2378 float tc[2], endtc[2];
2380 unsigned int tci[2];
2381 unsigned int tci1[2];
2382 unsigned int tcimin[2];
2383 unsigned int tcimax[2];
2388 const unsigned char * RESTRICT pixelbase;
2389 const unsigned char * RESTRICT pixel[4];
2390 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2391 // if no texture is bound, just fill it with white
2394 for (x = startx;x < endx;x++)
2396 out4f[x*4+0] = 1.0f;
2397 out4f[x*4+1] = 1.0f;
2398 out4f[x*4+2] = 1.0f;
2399 out4f[x*4+3] = 1.0f;
2403 mip = triangle->mip[texunitindex];
2404 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2405 // if this mipmap of the texture is 1 pixel, just fill it with that color
// presumably mipmap[mip][1] is the level's size in bytes (4 bytes = one texel) - confirm
2406 if (texture->mipmap[mip][1] == 4)
// NOTE(review): these reads index texture->bytes directly rather than pixelbase, so the
// mipmap[mip][0] offset is ignored here, while DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8
// reads its single-texel color through pixelbase - confirm which one is intended
2408 c[0] = texture->bytes[2] * (1.0f/255.0f);
2409 c[1] = texture->bytes[1] * (1.0f/255.0f);
2410 c[2] = texture->bytes[0] * (1.0f/255.0f);
2411 c[3] = texture->bytes[3] * (1.0f/255.0f);
2412 for (x = startx;x < endx;x++)
2414 out4f[x*4+0] = c[0];
2415 out4f[x*4+1] = c[1];
2416 out4f[x*4+2] = c[2];
2417 out4f[x*4+3] = c[3];
2421 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2422 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2423 flags = texture->flags;
// texture size in texels scales the coordinates; tciwidth is the row length for addressing
2424 tcscale[0] = texture->mipmap[mip][2];
2425 tcscale[1] = texture->mipmap[mip][3];
2426 tciwidth = texture->mipmap[mip][2];
// size-1 serves both as the clamp maximum and as the AND mask for repeat addressing
2429 tcimax[0] = texture->mipmap[mip][2]-1;
2430 tcimax[1] = texture->mipmap[mip][3]-1;
2431 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2432 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// coordinate at the first pixel: interpolated attribute times zf, scaled to texels
2433 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2434 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// the span is processed in chunks of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the exact
// coordinate is computed at each chunk end and stepped in 12-bit fixed point in between
2440 for (x = startx;x < endx;)
2442 unsigned int subtc[2];
2443 unsigned int substep[2];
// 4096 is the fixed-point scale (1<<12); dividing by the chunk length gives a per-pixel step
2444 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2445 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2446 if (nextsub >= endx)
// last chunk: end on the final pixel and rescale the step to the shorter length
2448 nextsub = endsub = endx-1;
2449 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2453 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2454 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// NOTE(review): float values are converted to unsigned int here; negative inputs make
// that conversion undefined behavior in C - confirm the expected coordinate range
2460 substep[0] = (endtc[0] - tc[0]) * subscale;
2461 substep[1] = (endtc[1] - tc[1]) * subscale;
2462 subtc[0] = tc[0] * (1<<12);
2463 subtc[1] = tc[1] * (1<<12);
// bilinear sampling with clamped texel indices
2466 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2468 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// low 12 bits are the blend fraction; the four weights sum to 1<<24
2470 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2471 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2472 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2473 tci[0] = subtc[0]>>12;
2474 tci[1] = subtc[1]>>12;
2475 tci1[0] = tci[0] + 1;
2476 tci1[1] = tci[1] + 1;
// clamp into [tcimin, tcimax]; the comparisons are unsigned
2477 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2478 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2479 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2480 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2481 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2482 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2483 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2484 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 is 255 * (1<<24), the largest possible weighted sum, so results land in 0..1
2485 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2486 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2487 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2488 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2489 out4f[x*4+0] = c[0];
2490 out4f[x*4+1] = c[1];
2491 out4f[x*4+2] = c[2];
2492 out4f[x*4+3] = c[3];
// bilinear sampling with repeat addressing (indices reduced with the AND mask)
2497 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2499 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2500 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2501 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2502 tci[0] = subtc[0]>>12;
2503 tci[1] = subtc[1]>>12;
2504 tci1[0] = tci[0] + 1;
2505 tci1[1] = tci[1] + 1;
2506 tci[0] &= tciwrapmask[0];
2507 tci[1] &= tciwrapmask[1];
2508 tci1[0] &= tciwrapmask[0];
2509 tci1[1] &= tciwrapmask[1];
2510 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2511 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2512 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2513 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2514 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2515 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2516 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2517 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2518 out4f[x*4+0] = c[0];
2519 out4f[x*4+1] = c[1];
2520 out4f[x*4+2] = c[2];
2521 out4f[x*4+3] = c[3];
// single-texel sampling with clamped indices
2525 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2527 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2529 tci[0] = subtc[0]>>12;
2530 tci[1] = subtc[1]>>12;
2531 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2532 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2533 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2534 c[0] = pixel[0][2] * (1.0f / 255.0f);
2535 c[1] = pixel[0][1] * (1.0f / 255.0f);
2536 c[2] = pixel[0][0] * (1.0f / 255.0f);
2537 c[3] = pixel[0][3] * (1.0f / 255.0f);
2538 out4f[x*4+0] = c[0];
2539 out4f[x*4+1] = c[1];
2540 out4f[x*4+2] = c[2];
2541 out4f[x*4+3] = c[3];
// single-texel sampling with repeat addressing
2546 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2548 tci[0] = subtc[0]>>12;
2549 tci[1] = subtc[1]>>12;
2550 tci[0] &= tciwrapmask[0];
2551 tci[1] &= tciwrapmask[1];
2552 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2553 c[0] = pixel[0][2] * (1.0f / 255.0f);
2554 c[1] = pixel[0][1] * (1.0f / 255.0f);
2555 c[2] = pixel[0][0] * (1.0f / 255.0f);
2556 c[3] = pixel[0][3] * (1.0f / 255.0f);
2557 out4f[x*4+0] = c[0];
2558 out4f[x*4+1] = c[1];
2559 out4f[x*4+2] = c[2];
2560 out4f[x*4+3] = c[3];
// SSE2 span sampler: looks up the texture bound to unit texunitindex along one span using
// the interpolated attribute arrayindex as coordinates, and writes one 32-bit texel value
// per pixel to out4ub without changing the stored channel order.
// Coordinates are stepped in 16.16 fixed point, two pixels per loop iteration with a
// single-pixel tail, with separate paths for bilinear/nearest and clamp/repeat addressing.
// NOTE(review): braces, declarations and several short statements (for example the tests
// that choose between the paths, and the updates of tc and subtc) are not visible in this
// excerpt.
2566 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2570 int startx = span->startx;
2571 int endx = span->endx;
2573 __m128 data, slope, tcscale;
2574 __m128i tcsize, tcmask, tcoffset, tcmax;
2576 __m128i subtc, substep, endsubtc;
2579 int affine; // LordHavoc: optimized affine texturing case
2580 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2581 const unsigned char * RESTRICT pixelbase;
2582 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2583 // if no texture is bound, just fill it with white
// every byte of the span's pixels is set to 255
2586 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2589 mip = triangle->mip[texunitindex];
2590 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2591 // if this mipmap of the texture is 1 pixel, just fill it with that color
2592 if (texture->mipmap[mip][1] == 4)
// the single texel is read as one 32-bit value from the start of this mip level
2594 unsigned int k = *((const unsigned int *)pixelbase);
2595 for (x = startx;x < endx;x++)
// identical zf at both ends of the span selects the affine case, which treats the whole
// span as one chunk in the loop below
2599 affine = zf[startx] == zf[endx-1];
2600 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2601 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2602 flags = texture->flags;
// tcsize = (mipmap[mip][2], mipmap[mip][3]) repeated twice; tcmask = size-1 per lane
2603 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2604 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2605 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the first two attribute components into both halves and scale them to texels
2606 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2607 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// coordinate at the first pixel: (data + slope*startx) * zf[startx]
2608 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift
// NOTE(review): presumably applied only when filtering; the guarding line is not visible
2610 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// convert to 16.16 fixed point
2611 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset holds 4 in its low 16 bits and (first size component)*4 in
// its high 16 bits, so _mm_madd_epi16 on a pair of 16-bit indices yields
// first*4 + second*(row length)*4, i.e. a byte offset into the mip level
2612 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 narrowed to 16-bit lanes, used for clamping and for AND wrapping below
2613 tcmax = _mm_packs_epi32(tcmask, tcmask);
2614 for (x = startx;x < endx;)
2616 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2617 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2618 if (nextsub >= endx || affine)
// final (or affine) chunk: end on the last pixel and rescale the step to its length
2620 nextsub = endsub = endx-1;
2621 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2625 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2627 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2628 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2629 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinate of pixel x, high half = coordinate of pixel x+1;
// the step is doubled because each iteration advances two pixels
2630 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2631 substep = _mm_slli_epi32(substep, 1);
// integer texel coordinates at the chunk start and just past its end; if all four are
// >= 0 and < size-1 the 2x2 neighbourhood is in range throughout and the path below
// needs neither clamping nor wrapping
2634 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2635 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// bytes per texture row, taken from the high half of a tcoffset lane
2637 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2638 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2640 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// gather the integer (high 16-bit) parts of the coordinates of both pixels
2641 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2642 tci = _mm_madd_epi16(tci, tcoffset);
2643 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2644 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; ptr + stride is the next row
2645 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2646 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2647 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2648 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fractions halved to 15 bits so they stay non-negative in the signed 16-bit multiply
2649 fracm = _mm_srli_epi16(subtc, 1);
2650 pix1 = _mm_add_epi16(pix1,
2651 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2652 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2653 pix3 = _mm_add_epi16(pix3,
2654 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2655 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2656 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2657 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2658 pix2 = _mm_add_epi16(pix2,
2659 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2660 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
// pack both results back to bytes and store two output pixels at once
2661 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel variant of the same in-range bilinear lookup
2665 const unsigned char * RESTRICT ptr1;
2666 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2667 tci = _mm_madd_epi16(tci, tcoffset);
2668 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2669 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2670 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2671 fracm = _mm_srli_epi16(subtc, 1);
2672 pix1 = _mm_add_epi16(pix1,
2673 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2674 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2675 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2676 pix1 = _mm_add_epi16(pix1,
2677 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2678 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2679 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear lookup with clamp-to-edge addressing: the four neighbour indices are built by
// adding (0,0), (1,0), (0,1), (1,1) and each texel is fetched individually
2683 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2685 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2687 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
// first pixel of the pair: indices clamped to [0, size-1]
2688 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2689 tci = _mm_madd_epi16(tci, tcoffset);
2690 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2691 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2692 _mm_setzero_si128());
2693 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2694 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2695 _mm_setzero_si128());
2696 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the second pixel of the pair is wrapped with an AND mask here, whereas the
// first pixel above and the single-pixel tail below clamp with min/max; inside the
// CLAMPTOEDGE branch this looks like a copy/paste slip from the repeat path - confirm and
// switch to the min/max form
2697 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2698 tci = _mm_madd_epi16(tci, tcoffset);
2699 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2700 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2701 _mm_setzero_si128());
2702 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2703 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2704 _mm_setzero_si128());
2705 fracm = _mm_srli_epi16(subtc, 1);
2706 pix1 = _mm_add_epi16(pix1,
2707 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2708 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2709 pix3 = _mm_add_epi16(pix3,
2710 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2711 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2712 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2713 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2714 pix2 = _mm_add_epi16(pix2,
2715 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2716 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2717 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail of the clamped bilinear path
2721 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2722 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2723 tci = _mm_madd_epi16(tci, tcoffset);
2724 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2725 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2726 _mm_setzero_si128());
2727 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2728 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2729 _mm_setzero_si128());
2730 fracm = _mm_srli_epi16(subtc, 1);
2731 pix1 = _mm_add_epi16(pix1,
2732 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2733 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2734 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2735 pix1 = _mm_add_epi16(pix1,
2736 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2737 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2738 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear lookup with repeat addressing: all neighbour indices are reduced with the AND mask
2744 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2746 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2747 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2748 tci = _mm_madd_epi16(tci, tcoffset);
2749 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2750 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2751 _mm_setzero_si128());
2752 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2753 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2754 _mm_setzero_si128());
2755 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2756 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2757 tci = _mm_madd_epi16(tci, tcoffset);
2758 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2759 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2760 _mm_setzero_si128());
2761 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2762 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2763 _mm_setzero_si128());
2764 fracm = _mm_srli_epi16(subtc, 1);
2765 pix1 = _mm_add_epi16(pix1,
2766 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2767 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2768 pix3 = _mm_add_epi16(pix3,
2769 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2770 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2771 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2772 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2773 pix2 = _mm_add_epi16(pix2,
2774 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2775 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2776 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail of the repeating bilinear path
2780 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2781 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2782 tci = _mm_madd_epi16(tci, tcoffset);
2783 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2784 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2785 _mm_setzero_si128());
2786 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2787 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2788 _mm_setzero_si128());
2789 fracm = _mm_srli_epi16(subtc, 1);
2790 pix1 = _mm_add_epi16(pix1,
2791 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2792 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2793 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2794 pix1 = _mm_add_epi16(pix1,
2795 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2796 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2797 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel lookup with clamp-to-edge addressing: clamp, convert to a byte offset, copy 4 bytes
2804 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2806 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2808 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2809 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2810 tci = _mm_madd_epi16(tci, tcoffset);
2811 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2812 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2816 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2817 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2818 tci = _mm_madd_epi16(tci, tcoffset);
2819 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel lookup with repeat addressing
2825 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2827 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2828 tci = _mm_and_si128(tci, tcmax);
2829 tci = _mm_madd_epi16(tci, tcoffset);
2830 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2831 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2835 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2836 tci = _mm_and_si128(tci, tcmax);
2837 tci = _mm_madd_epi16(tci, tcoffset);
2838 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2847 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2850 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadow-map lookup taking a float vector and returning a float.
// NOTE(review): the body of this function is not visible in this excerpt, so the meaning
// of vector and of the returned value cannot be stated from here.
2853 float DPSOFTRAST_SampleShadowmap(const float *vector)
// For each pixel of the span, evaluates the four-component interpolated attribute
// arrayindex and multiplies it component-wise into in4f, writing the product to out4f.
// NOTE(review): the local declarations and the assignment of z are on lines not visible
// here; presumably z is taken from zf[x] - confirm.
2859 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2862 int startx = span->startx;
2863 int endx = span->endx;
// the macro is handed data and slope, which are then used as base value and per-pixel step
2868 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2869 for (x = startx;x < endx;x++)
// attribute value at this pixel: (base + step*x) scaled by z
2872 c[0] = (data[0] + slope[0]*x) * z;
2873 c[1] = (data[1] + slope[1]*x) * z;
2874 c[2] = (data[2] + slope[2]*x) * z;
2875 c[3] = (data[3] + slope[3]*x) * z;
2876 out4f[x*4+0] = in4f[x*4+0] * c[0];
2877 out4f[x*4+1] = in4f[x*4+1] * c[1];
2878 out4f[x*4+2] = in4f[x*4+2] * c[2];
2879 out4f[x*4+3] = in4f[x*4+3] * c[3];
// For each pixel of the span, evaluates the four-component interpolated attribute
// arrayindex and stores it in out4f.
// NOTE(review): the local declarations and the assignment of z are on lines not visible
// here; presumably z is taken from zf[x] - confirm.
2883 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2886 int startx = span->startx;
2887 int endx = span->endx;
// the macro is handed data and slope, which are then used as base value and per-pixel step
2892 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2893 for (x = startx;x < endx;x++)
// attribute value at this pixel: (base + step*x) scaled by z
2896 c[0] = (data[0] + slope[0]*x) * z;
2897 c[1] = (data[1] + slope[1]*x) * z;
2898 c[2] = (data[2] + slope[2]*x) * z;
2899 c[3] = (data[3] + slope[3]*x) * z;
2900 out4f[x*4+0] = c[0];
2901 out4f[x*4+1] = c[1];
2902 out4f[x*4+2] = c[2];
2903 out4f[x*4+3] = c[3];
// Float span stage: out4f = ina4f + max(inb4f - subcolor, 0) per component,
// for every x in [span->startx, span->endx). `triangle` is not used by the visible code.
2907 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2909 int x, startx = span->startx, endx = span->endx;
2910 float c[4], localcolor[4];
// Copy the four subtract values into locals before the loop.
2911 localcolor[0] = subcolor[0];
2912 localcolor[1] = subcolor[1];
2913 localcolor[2] = subcolor[2];
2914 localcolor[3] = subcolor[3];
2915 for (x = startx;x < endx;x++)
// Subtract the threshold from the second input and clamp negatives to zero.
2917 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2918 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2919 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2920 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
// Add the clamped remainder onto the first input.
2921 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2922 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2923 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2924 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Float span stage: out4f = ina4f * inb4f, component-wise, for every x in
// [span->startx, span->endx). `triangle` is not used by the visible code.
2928 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2930 int x, startx = span->startx, endx = span->endx;
2931 for (x = startx;x < endx;x++)
2933 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2934 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2935 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2936 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Float span stage: out4f = ina4f + inb4f, component-wise, for every x in
// [span->startx, span->endx). No clamping is applied to the sum.
// `triangle` is not used by the visible code.
2940 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2942 int x, startx = span->startx, endx = span->endx;
2943 for (x = startx;x < endx;x++)
2945 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2946 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2947 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2948 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span stage: blends the two inputs per pixel as
// out4f = ina4f * a + inb4f * b for every x in [span->startx, span->endx),
// where a is one minus the fourth component of the inb4f pixel.
// `triangle` is not used by the visible code.
2952 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2954 int x, startx = span->startx, endx = span->endx;
2956 for (x = startx;x < endx;x++)
// Weight for the first input: the complement of inb's fourth component.
2958 a = 1.0f - inb4f[x*4+3];
// NOTE(review): the assignment of b is not visible in this chunk; presumably
// b = inb4f[x*4+3] so that a + b == 1 — confirm.
2960 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2961 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2962 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2963 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Float span stage: blends every input pixel toward a constant color,
// out4f = in4f * (1 - color[3]) + color * color[3], for every x in
// [span->startx, span->endx). The fourth component is blended the same way as
// the other three. `triangle` is not used by the visible code.
2967 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2969 int x, startx = span->startx, endx = span->endx;
2970 float localcolor[4], ilerp, lerp;
// Copy the color into locals and derive the two blend weights once.
2971 localcolor[0] = color[0];
2972 localcolor[1] = color[1];
2973 localcolor[2] = color[2];
2974 localcolor[3] = color[3];
2975 ilerp = 1.0f - localcolor[3];
2976 lerp = localcolor[3];
2977 for (x = startx;x < endx;x++)
2979 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2980 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2981 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2982 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2988 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2992 int startx = span->startx;
2993 int endx = span->endx;
2996 __m128i submod, substep, endsubmod;
2997 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2998 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2999 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3000 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3001 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3002 for (x = startx; x < endx;)
3004 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3005 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3006 if (nextsub >= endx)
3008 nextsub = endsub = endx-1;
3009 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3013 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3014 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3015 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3016 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3017 substep = _mm_packs_epi32(substep, substep);
3018 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3020 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3021 pix = _mm_mulhi_epu16(pix, submod);
3022 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3026 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3027 pix = _mm_mulhi_epu16(pix, submod);
3028 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3035 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3039 int startx = span->startx;
3040 int endx = span->endx;
3043 __m128i submod, substep, endsubmod;
3044 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3045 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3046 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3047 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3048 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3049 for (x = startx; x < endx;)
3051 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3052 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3053 if (nextsub >= endx)
3055 nextsub = endsub = endx-1;
3056 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3060 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3061 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3062 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3063 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3064 substep = _mm_packs_epi32(substep, substep);
3065 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3067 __m128i pix = _mm_srai_epi16(submod, 4);
3068 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3072 __m128i pix = _mm_srai_epi16(submod, 4);
3073 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 span stage: out = ina + saturating(inb - subcolor) per byte channel,
// for x in [span->startx, span->endx). The final pack clamps each channel to 0..255.
// `triangle` is not used by the visible code.
3080 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3083 int x, startx = span->startx, endx = span->endx;
// Scale the float subtract color by 255, swap components 0 and 2 to match the BGRA
// byte order, and pack to 16-bit words (the same 4 words in both halves).
3084 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3085 localcolor = _mm_packs_epi32(localcolor, localcolor);
// Main loop: two pixels (8 bytes) per iteration, widened to 16-bit words.
3086 for (x = startx;x+2 <= endx;x+=2)
3088 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3089 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// The unsigned saturating subtract floors (inb - subcolor) at zero.
3090 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3091 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// Single-pixel variant of the same computation.
// NOTE(review): the condition guarding it is not visible in this chunk; presumably it handles an odd trailing pixel — confirm.
3095 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3096 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3097 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3098 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span stage: out = (ina * inb) / 256 per byte channel, for x in
// [span->startx, span->endx). `triangle` is not used by the visible code.
3103 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3106 int x, startx = span->startx, endx = span->endx;
// Main loop: two pixels per iteration.
3107 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are zero-extended to words; inb bytes are placed in the high byte of each
// word (value * 256), so the high half of the 16x16 product is ina*inb/256.
3109 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3110 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3111 pix1 = _mm_mulhi_epu16(pix1, pix2);
3112 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// Single-pixel variant of the same computation.
// NOTE(review): the condition guarding it is not visible in this chunk; presumably it handles an odd trailing pixel — confirm.
3116 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3117 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3118 pix1 = _mm_mulhi_epu16(pix1, pix2);
3119 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span stage: out = ina + inb per byte channel, clamped to 255 by the
// final unsigned-saturating pack, for x in [span->startx, span->endx).
// `triangle` is not used by the visible code.
3124 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3127 int x, startx = span->startx, endx = span->endx;
// Main loop: two pixels per iteration, widened to 16-bit words so the sum cannot wrap.
3128 for (x = startx;x+2 <= endx;x+=2)
3130 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3131 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3132 pix1 = _mm_add_epi16(pix1, pix2);
3133 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// Single-pixel variant of the same computation.
// NOTE(review): the condition guarding it is not visible in this chunk; presumably it handles an odd trailing pixel — confirm.
3137 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3138 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3139 pix1 = _mm_add_epi16(pix1, pix2);
3140 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span stage: out = ina + (tint * inb) per byte channel, clamped to 255
// by the final pack, for x in [span->startx, span->endx).
// The tint is read as four floats and used in the order given (no component swap).
// `triangle` is not used by the visible code.
3145 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3148 int x, startx = span->startx, endx = span->endx;
// Convert the tint to fixed point with scale 256 and pack to 16-bit words (same 4 words in both halves).
3149 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3150 tint = _mm_packs_epi32(tint, tint);
// Main loop: two pixels per iteration.
3151 for (x = startx;x+2 <= endx;x+=2)
// inb bytes sit in the high byte of each word, so mulhi(tint, inb) is tint*inb/256 in byte units.
3153 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3154 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3155 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3156 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// Single-pixel variant of the same computation.
// NOTE(review): the condition guarding it is not visible in this chunk; presumably it handles an odd trailing pixel — confirm.
3160 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3161 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3162 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3163 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span stage: blends ina toward inb using inb's own fourth byte as the
// weight, out = ina + (inb - ina) * inb.a / 256 per channel, for x in
// [span->startx, span->endx). `triangle` is not used by the visible code.
3168 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3171 int x, startx = span->startx, endx = span->endx;
// Main loop: two pixels per iteration, widened to 16-bit words.
3172 for (x = startx;x+2 <= endx;x+=2)
3174 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3175 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// Broadcast word 3 (the fourth channel) of each inb pixel across that pixel's four words.
3176 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// Both factors are shifted left by 4, so the high half of the signed product is (inb - ina) * weight / 256.
3177 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3178 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// Single-pixel variant of the same computation.
// NOTE(review): the condition guarding it is not visible in this chunk; presumably it handles an odd trailing pixel — confirm.
3182 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3183 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3184 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3185 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3186 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 span stage: blends every input pixel toward a constant color using the
// color's fourth component as the weight, out = in + (color - in) * weight / 256
// per channel, for x in [span->startx, span->endx).
// `triangle` is not used by the visible code.
3191 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3194 int x, startx = span->startx, endx = span->endx;
// Scale the float color by 255, swap components 0 and 2 to match the BGRA byte order,
// and pack to 16-bit words (the same 4 words in both halves).
3195 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3196 localcolor = _mm_packs_epi32(localcolor, localcolor);
// Broadcast the fourth word (the weight) to every lane and pre-shift it by 4 for the mulhi below.
3197 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// Main loop: two pixels per iteration.
3198 for (x = startx;x+2 <= endx;x+=2)
3200 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3201 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3202 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// Single-pixel variant of the same computation.
// NOTE(review): the condition guarding it is not visible in this chunk; presumably it handles an odd trailing pixel — confirm.
3206 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3207 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3208 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the Generic shader: transforms/projects the position array
// with the ModelViewProjectionMatrix uniform, loads the color and texcoord0
// arrays, and additionally loads texcoord1 when the SPECULAR permutation bit
// is set in dpsoftrast.shader_permutation.
3215 void DPSOFTRAST_VertexShader_Generic(void)
3217 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3218 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3219 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3220 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3221 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the Generic shader for one span. Builds the fragment color in
// a stack BGRA8 buffer from up to two textures and a varying attribute, then
// hands it to DPSOFTRAST_Draw_Span_FinishBGRA8.
3224 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3226 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3227 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is passed on to every sampling/varying helper below.
3230 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3231 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// Sample the first texture unit with attribute 2, then multiply by attribute 1.
3233 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3234 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3235 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// Sample the second texture unit (same attribute index) and combine it in place.
3237 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3238 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3241 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this condition repeats the COLORMAPPING test directly above, so the
// additive branch below can never run; presumably a different permutation flag was
// intended — confirm before changing.
3243 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3246 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3248 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3251 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): presumably the fallback when DIFFUSE is not set (the `else` is not
// visible in this chunk): the fragment color is attribute 1 alone — confirm.
3256 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3257 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the PostProcess shader: transforms/projects the position
// array with the ModelViewProjectionMatrix uniform and loads two texcoord arrays.
// Note the second load pairs TEXCOORD1 with TEXCOORD4 (the two arguments differ).
3262 void DPSOFTRAST_VertexShader_PostProcess(void)
3264 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3265 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3266 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the PostProcess shader for one span: samples the first
// texture unit into the fragment buffer, optionally adds bloom from the second
// unit, mixes in the ViewTintColor uniform, and finishes the span.
// The SATURATION and GAMMARAMPS permutations are recognized but do nothing yet.
3269 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3271 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3272 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3273 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3274 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3275 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// First texture unit, attribute 2, straight into the fragment color buffer.
3276 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3277 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// Second texture unit, attribute 3; added in place after subtracting the BloomColorSubtract uniform.
3279 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3280 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// Blend toward the view tint color, in place.
3282 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3283 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3285 // TODO: implement saturation
3287 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3289 // TODO: implement gammaramps
3291 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the Depth_Or_Shadow shader: only transforms/projects the
// position array with the ModelViewProjectionMatrix uniform; no other arrays are loaded.
3296 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3298 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the Depth_Or_Shadow shader for one span: zero-fills the
// span's pixels [span->startx, span->endx) in a stack BGRA8 buffer (4 bytes per
// pixel) and finishes the span with that buffer.
3301 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3303 // this is never called (because colormask is off when this shader is used)
3304 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3305 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3306 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3307 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3308 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the FlatColor shader: transforms/projects the position array
// with the ModelViewProjectionMatrix uniform and transforms texcoord0 with the
// TexMatrix uniform.
3313 void DPSOFTRAST_VertexShader_FlatColor(void)
3315 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3316 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the FlatColor shader for one span: output = color texture *
// Color_Ambient (with the Alpha uniform in the fourth channel), in BGRA8.
// Writes straight into color buffer 0 when the span is opaque and not
// alpha-tested; otherwise writes into a stack buffer and lets
// DPSOFTRAST_Draw_Span_FinishBGRA8 complete the span.
3319 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3322 unsigned char * RESTRICT pixelmask = span->pixelmask;
// Destination is offset so that pixel[x*4] addresses framebuffer column span->x + x on row span->y.
3323 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3324 int x, startx = span->startx, endx = span->endx;
3325 __m128i Color_Ambientm;
3326 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3327 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3328 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3329 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3330 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// Alpha test or a non-opaque blend mode: redirect output to the scratch buffer.
3331 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3332 pixel = buffer_FragColorbgra8;
// Ambient color in fixed point (scale 256), components 0 and 2 swapped to match BGRA;
// the fourth lane is cleared and replaced by the Alpha uniform scaled by 255,
// then everything is packed to 16-bit words (same 4 words in both halves).
3333 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3334 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3335 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3336 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3337 for (x = startx;x < endx;x++)
// Fast path: at least 4 pixels remain and all four mask bytes equal 1, so 4 pixels are done at once.
// NOTE(review): the lines that advance x past these pixels and leave the iteration are
// not visible in this chunk (presumably `x += 3; continue;`) — confirm.
3340 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3343 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// Texture bytes sit in the high byte of each word, so mulhi yields ambient*texel in byte units.
3344 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3345 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3346 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// Single-pixel path.
// NOTE(review): any per-pixel mask test before this path is not visible in this chunk — confirm.
3352 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3353 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3354 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Only the scratch-buffer case needs the finish pass; the direct case already wrote the framebuffer.
3356 if (pixel == buffer_FragColorbgra8)
3357 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the VertexColor shader: transforms/projects the position
// array with the ModelViewProjectionMatrix uniform, loads the color array, and
// transforms texcoord0 with the TexMatrix uniform.
3363 void DPSOFTRAST_VertexShader_VertexColor(void)
3365 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3366 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3367 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the VertexColor shader for one span:
// output = (Color_Diffuse * interpolated vertex color + Color_Ambient) * color texture, in BGRA8.
// Writes straight into color buffer 0 when the span is opaque and not
// alpha-tested; otherwise writes into a stack buffer and lets
// DPSOFTRAST_Draw_Span_FinishBGRA8 complete the span.
3370 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3373 unsigned char * RESTRICT pixelmask = span->pixelmask;
// Destination is offset so that pixel[x*4] addresses framebuffer column span->x + x on row span->y.
3374 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3375 int x, startx = span->startx, endx = span->endx;
3376 __m128i Color_Ambientm, Color_Diffusem;
3378 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3379 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3381 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3382 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3383 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// Alpha test or a non-opaque blend mode: redirect output to the scratch buffer.
3384 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3385 pixel = buffer_FragColorbgra8;
// Ambient color in fixed point (scale 256), components 0 and 2 swapped to match BGRA;
// the fourth lane is replaced by the Alpha uniform scaled by 255, then packed to 16-bit words.
3386 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3387 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3388 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3389 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Diffuse color in fixed point with scale 4096, same component swap; its fourth lane is zeroed,
// so the fourth output channel depends only on the ambient/alpha term.
3390 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3391 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3392 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// Set up the color attribute, swap components 0 and 2, move it to startx, and pre-scale by 4096.
3393 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3394 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3395 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3396 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3397 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3398 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is advanced by one slope step per loop iteration (in the loop header).
3399 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3401 __m128i color, mod, pix;
// Fast path: at least 4 pixels remain and all four mask bytes equal 1.
// NOTE(review): the lines that advance x past these pixels are not visible in this chunk — confirm.
3402 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3405 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3406 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// Per-pixel modulation = data * z for each of the four pixels; data is stepped three
// times here, and the loop header supplies the fourth step.
3407 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3408 data = _mm_add_ps(data, slope);
3409 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3410 data = _mm_add_ps(data, slope);
3411 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3412 data = _mm_add_ps(data, slope);
3413 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * mod + ambient) * texel, with texel bytes in the high byte of each word.
3414 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3415 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3416 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3417 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3418 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// Single-pixel path: same formula for one pixel.
// NOTE(review): any per-pixel mask test before this path is not visible in this chunk — confirm.
3424 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3425 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3426 mod = _mm_packs_epi32(mod, mod);
3427 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3428 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Only the scratch-buffer case needs the finish pass.
3430 if (pixel == buffer_FragColorbgra8)
3431 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the Lightmap shader: transforms/projects the position array
// with the ModelViewProjectionMatrix uniform, transforms texcoord0 with the
// TexMatrix uniform, and loads texcoord4 unchanged.
3437 void DPSOFTRAST_VertexShader_Lightmap(void)
3439 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3440 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3441 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the Lightmap shader for one span:
// output = (Color_Diffuse * lightmap texel + Color_Ambient) * color texel,
// plus Color_Glow * glow texel when the GLOW permutation is set, in BGRA8.
// Writes straight into color buffer 0 when the span is opaque and not
// alpha-tested; otherwise writes into a stack buffer and lets
// DPSOFTRAST_Draw_Span_FinishBGRA8 complete the span.
3444 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3447 unsigned char * RESTRICT pixelmask = span->pixelmask;
// Destination is offset so that pixel[x*4] addresses framebuffer column span->x + x on row span->y.
3448 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3449 int x, startx = span->startx, endx = span->endx;
3450 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3451 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3452 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3453 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3454 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3455 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3456 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Color texture uses texcoord0; the lightmap texture uses texcoord4.
3457 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3458 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Alpha test or a non-opaque blend mode: redirect output to the scratch buffer.
3459 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3460 pixel = buffer_FragColorbgra8;
// Ambient color in fixed point (scale 256), components 0 and 2 swapped to match BGRA;
// the fourth lane is replaced by the Alpha uniform scaled by 255, then packed to 16-bit words.
3461 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3462 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3463 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3464 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Diffuse color in fixed point (scale 256), same swap, fourth lane zeroed.
3465 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3466 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3467 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3468 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// Glow variant: sample the glow texture with texcoord0 and prepare the glow color like the diffuse color.
3470 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3471 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3472 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3473 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// Low 64 bits: ambient; high 64 bits: glow. Used by the single-pixel path below.
3474 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3475 for (x = startx;x < endx;x++)
3477 __m128i color, lightmap, glow, pix;
// Fast path: at least 4 pixels remain and all four mask bytes equal 1.
// NOTE(review): the lines that advance x past these pixels are not visible in this chunk — confirm.
3478 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3481 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3482 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3483 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * color + glowcolor * glow, for pixels 0-1 then 2-3.
3484 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3485 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3486 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3487 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3488 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3489 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3490 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// Single-pixel path: the lit color term is computed in the low 64 bits and the glow term
// in the high 64 bits of one register, then the two halves are summed.
// NOTE(review): any per-pixel mask test before this path is not visible in this chunk — confirm.
3496 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3497 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3498 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3499 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3500 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3501 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Second loop: same computation without the glow term.
// NOTE(review): presumably this is the `else` branch of the GLOW test (the `else` is not visible in this chunk) — confirm.
3506 for (x = startx;x < endx;x++)
3508 __m128i color, lightmap, pix;
3509 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3512 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3513 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3514 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3515 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3516 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3517 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3518 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// Single-pixel path without glow.
3524 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3525 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3526 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3527 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Only the scratch-buffer case needs the finish pass.
3530 if (pixel == buffer_FragColorbgra8)
3531 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap entry points below
// call these two shaders before their definitions appear.
3536 void DPSOFTRAST_VertexShader_LightDirection(void);
3537 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex stage: delegates entirely to the LightDirection vertex shader.
3539 void DPSOFTRAST_VertexShader_FakeLight(void)
3541 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel stage: forwards thread/triangle/span unchanged to the shared
// LightDirection pixel shader, which branches on thread->shader_mode
// (including SHADERMODE_FAKELIGHT).
3544 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3546 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// LightDirectionMap (model space) vertex stage: runs the LightDirection vertex
// shader, then additionally loads TEXCOORD4, which the LightDirection pixel
// shader uses as the coordinate set for its lightmap and deluxemap fetches.
3551 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3553 DPSOFTRAST_VertexShader_LightDirection();
3554 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// LightDirectionMap (model space) pixel stage: forwards to the shared
// LightDirection pixel shader, which selects its code path from
// thread->shader_mode.
3557 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3559 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// LightDirectionMap (tangent space) vertex stage: identical call sequence to
// the model-space variant - LightDirection vertex shader, then load TEXCOORD4
// (the lightmap/deluxemap coordinate set used by the pixel shader).
3564 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3566 DPSOFTRAST_VertexShader_LightDirection();
3567 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// LightDirectionMap (tangent space) pixel stage: forwards to the shared
// LightDirection pixel shader, which selects its code path from
// thread->shader_mode.
3570 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3572 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage for directional lighting.
// For every vertex it expresses the LightDir uniform and the vertex-to-eye
// vector in the basis formed by TEXCOORD1/2/3 (read as svector, tvector and
// normal) and stores the results in TEXCOORD5 and TEXCOORD6 (w = 0), where the
// LightDirection pixel shader picks them up. TEXCOORD0 is transformed by the
// TexMatrixM1 uniform and POSITION is finally run through the
// ModelViewProjectionMatrixM1 uniform.
3577 void DPSOFTRAST_VertexShader_LightDirection(void)
3580 int numvertices = dpsoftrast.numvertices;
3582 float LightVector[4];
3583 float EyePosition[4];
3584 float EyeVectorModelSpace[4];
// Copy the two uniforms used per vertex into locals.
3590 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3591 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3592 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3593 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3594 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3595 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3596 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3597 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex arrays read from dpsoftrast.post_array4f in the loop
// below. NOTE(review): presumably Array_Load(X, X) fills post_array4f[X] from
// the input array X - confirm against DPSOFTRAST_Array_Load.
3598 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3599 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3600 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3601 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3602 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3603 for (i = 0;i < numvertices;i++)
// Fetch position and the three basis vectors (xyz only) for vertex i.
3605 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3606 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3607 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3608 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3609 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3610 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3611 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3612 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3613 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3614 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3615 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3616 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction projected onto (svector, tvector, normal) -> TEXCOORD5.
3617 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3618 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3619 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3620 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3621 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3622 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3623 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vertex-to-eye vector (EyePosition - position) projected onto the same
// basis -> TEXCOORD6. It is stored unnormalized; the pixel shader normalizes.
3624 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3625 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3626 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3627 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3628 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3629 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3630 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3631 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3632 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3633 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Project POSITION with the model-view-projection matrix.
// NOTE(review): the meaning of the -1 second argument is defined by
// DPSOFTRAST_Array_TransformProject - confirm there.
3635 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar min/max helpers. Each argument is expanded more than once, so pass
// only expressions without side effects.
3638 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3639 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of two indexable operands.
3640 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length: dot product of v with itself.
3641 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Euclidean length via sqrt() of the squared length.
3642 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro that starts by computing len = sqrt(dot(v, v)).
// NOTE(review): presumably it then rescales v in place to unit length -
// confirm against the full macro body.
3643 #define DPSOFTRAST_Vector3Normalize(v)\
3646 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel stage shared by the LightDirection, LightDirectionMap (model/tangent
// space) and FakeLight shaders.
//
// thread   - supplies the uniforms (thread->uniform4f), shader_permutation,
//            shader_mode and shader_exactspecularmath read below.
// triangle - passed through to the span helpers and attribute setup.
// span     - startx/endx bound the pixel loop; also passed to the helpers.
//
// thread->shader_permutation selects one of three shading paths:
//   SPECULAR : ambient + diffuse + specular (normal map and gloss map sampled)
//   DIFFUSE  : ambient + diffuse (normal map sampled)
//   neither  : ambient only
// with the glow texture added when SHADERPERMUTATION_GLOW is set.
// Inside the first two paths thread->shader_mode decides where the light
// direction and light color come from (deluxemap/lightmap, eye vector, or the
// interpolated TEXCOORD5 light vector). Results go to a local BGRA8 buffer
// that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3657 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3659 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3660 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3661 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3669 int x, startx = span->startx, endx = span->endx;
3670 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3671 float LightVectordata[4];
3672 float LightVectorslope[4];
3673 float EyeVectordata[4];
3674 float EyeVectorslope[4];
3675 float VectorSdata[4];
3676 float VectorSslope[4];
3677 float VectorTdata[4];
3678 float VectorTslope[4];
3679 float VectorRdata[4];
3680 float VectorRslope[4];
3682 float diffusetex[4];
3684 float surfacenormal[4];
3685 float lightnormal[4];
3686 float lightnormal_modelspace[4];
3688 float specularnormal[4];
3691 float SpecularPower;
// Color uniforms are copied with components 0 and 2 swapped so that local
// index k lines up with byte k of the ...bgra8 buffers it is multiplied with
// below (presumably B,G,R,A order, going by the buffer names).
3693 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3694 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3695 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3696 Color_Glow[3] = 0.0f;
3697 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3698 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3699 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The ambient alpha slot carries the Alpha uniform; it scales the output alpha.
3700 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3701 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3702 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3703 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3704 Color_Pants[3] = 0.0f;
3705 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3706 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3707 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3708 Color_Shirt[3] = 0.0f;
// Span setup, then texture fetches common to all paths. buffer_z is handed to
// Draw_Span_Begin and to every texture fetch.
// NOTE(review): presumably Draw_Span_Begin fills buffer_z with per-pixel
// perspective data - confirm.
3709 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3710 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// NOTE(review): pants/shirt are sampled whenever COLORMAPPING is set, but only
// the SPECULAR pixel loop blends them into diffusetex; the DIFFUSE and
// ambient-only loops never read these buffers.
3711 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3713 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3714 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3716 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3718 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: SPECULAR permutation (ambient + diffuse + specular) ----
3720 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3722 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3723 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3724 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3725 Color_Diffuse[3] = 0.0f;
// Default light color from the uniform; the lightmap and FakeLight modes
// overwrite components 0..2 per pixel inside the loop.
3726 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3727 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3728 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3729 LightColor[3] = 0.0f;
3730 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3731 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3732 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3733 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3734 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the raw 0..255 gloss
// alpha byte to form the pow() exponent.
3735 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Eye vector interpolants (TEXCOORD6) are needed for specular in every mode.
3736 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3737 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs: basis-vector interpolants and/or lightmap + deluxemap
// samples (TEXCOORD4), or the interpolated light vector (TEXCOORD5).
3739 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3741 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3742 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3743 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3744 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3745 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3747 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3749 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3750 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3752 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3754 // nothing of this needed
3758 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel shading for the SPECULAR path.
// NOTE(review): z, used as a per-pixel scale on the interpolants below, is
// presumably taken from buffer_z[x] - confirm.
3761 for (x = startx;x < endx;x++)
3764 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3765 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3766 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3767 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping: add the tinted pants and shirt layers to the base color.
// Their alpha weights are 0, so diffusetex[3] is left unchanged.
3768 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3770 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3771 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3772 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3773 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3775 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3776 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3777 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3778 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte/128 - 1 per component, with buffer bytes 2,1,0
// mapped to x,y,z, then normalize.
3779 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3780 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3781 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3782 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Model-space deluxemap: decode the light direction, then dot it with the
// interpolated VectorS/T/R (data + slope * x) and normalize.
3784 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3786 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3787 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3788 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3789 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3791 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3792 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3793 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3794 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3796 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3797 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3798 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3799 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3801 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3802 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3803 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3804 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3806 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3807 DPSOFTRAST_Vector3Normalize(lightnormal);
3809 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap color divided by 256 * lightnormal.z; the z term is clamped to at
// least 0.25, which caps the boost at 4x.
3811 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3812 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3813 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3814 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded direction is used as-is (no normalize
// call here) and the lightmap color is scaled by 1/256.
3817 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3819 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3820 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3821 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3823 float f = 1.0f / 256.0f;
3824 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3825 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3826 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// FakeLight: the light comes from the eye direction, with unit light color.
3829 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3831 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3832 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3833 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3834 DPSOFTRAST_Vector3Normalize(lightnormal);
3836 LightColor[0] = 1.0;
3837 LightColor[1] = 1.0;
3838 LightColor[2] = 1.0;
// Light direction from the interpolated TEXCOORD5 light vector; LightColor
// keeps the uniform value loaded before the loop.
3842 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3843 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3844 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3845 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L clamped at zero.
3848 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Two specular formulations follow. With shader_exactspecularmath the light
// vector is reflected about the surface normal and dotted with the normalized
// eye vector; the other formulation dots the surface normal with the
// normalized half vector (light + eye). Both clamp the result at zero.
3850 if(thread->shader_exactspecularmath)
3852 // reflect lightnormal at surfacenormal, take the negative of that
3853 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3855 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3856 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3857 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3858 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3860 // dot of this and normalize(EyeVectorFogDepth.xyz)
3861 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3862 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3863 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3864 DPSOFTRAST_Vector3Normalize(eyenormal);
3866 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3870 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3871 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3872 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3873 DPSOFTRAST_Vector3Normalize(eyenormal);
3875 specularnormal[0] = lightnormal[0] + eyenormal[0];
3876 specularnormal[1] = lightnormal[1] + eyenormal[1];
3877 specularnormal[2] = lightnormal[2] + eyenormal[2];
3878 DPSOFTRAST_Vector3Normalize(specularnormal);
3880 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform / 255 * gloss alpha byte.
3883 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine terms per channel. Results are clamped only on the upper side
// (255); alpha is the base texture alpha times Color_Ambient[3].
3884 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3886 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3887 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3888 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3889 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3893 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3894 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3895 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3896 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3899 buffer_FragColorbgra8[x*4+0] = d[0];
3900 buffer_FragColorbgra8[x*4+1] = d[1];
3901 buffer_FragColorbgra8[x*4+2] = d[2];
3902 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: DIFFUSE permutation (ambient + diffuse, no specular) ----
3905 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3907 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3908 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3909 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3910 Color_Diffuse[3] = 0.0f;
3911 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3912 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3913 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3914 LightColor[3] = 0.0f;
3915 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs, as in the specular path, except that the eye vector
// interpolants are only set up for FakeLight (no specular term needs them).
3917 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3919 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3920 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3921 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3922 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3923 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3925 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3927 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3928 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3930 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3932 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3936 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel shading for the DIFFUSE path; light direction and light color are
// derived exactly as in the specular loop.
3939 for (x = startx;x < endx;x++)
3942 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3943 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3944 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3945 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3946 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3947 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3948 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3949 DPSOFTRAST_Vector3Normalize(surfacenormal);
3951 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3953 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3954 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3955 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3956 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3958 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3959 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3960 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3961 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3963 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3964 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3965 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3966 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3968 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3969 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3970 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3971 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3973 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3974 DPSOFTRAST_Vector3Normalize(lightnormal);
3976 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3978 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3979 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3980 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3981 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3984 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3986 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3987 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3988 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3990 float f = 1.0f / 256.0f;
3991 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3992 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3993 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3996 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3998 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3999 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4000 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4001 DPSOFTRAST_Vector3Normalize(lightnormal);
4003 LightColor[0] = 1.0;
4004 LightColor[1] = 1.0;
4005 LightColor[2] = 1.0;
4009 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4010 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4011 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4012 DPSOFTRAST_Vector3Normalize(lightnormal);
// color = [glow] + base * (ambient + diffuseColor * N.L * lightColor),
// clamped to 255 on the upper side only.
4015 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4016 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4018 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4019 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4020 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4021 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4025 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4026 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4027 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4028 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4030 buffer_FragColorbgra8[x*4+0] = d[0];
4031 buffer_FragColorbgra8[x*4+1] = d[1];
4032 buffer_FragColorbgra8[x*4+2] = d[2];
4033 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (base color * Color_Ambient, plus optional glow) ----
4038 for (x = startx;x < endx;x++)
4041 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4042 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4043 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4044 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4046 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4048 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4049 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4050 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4051 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4055 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4056 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4057 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4058 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4060 buffer_FragColorbgra8[x*4+0] = d[0];
4061 buffer_FragColorbgra8[x*4+1] = d[1];
4062 buffer_FragColorbgra8[x*4+2] = d[2];
4063 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finish routine.
4066 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for positional light sources.
// For every vertex it computes the vertex-to-light vector (LightPosition -
// position) and the vertex-to-eye vector (EyePosition - position), expresses
// both in the basis read from TEXCOORD1/2/3 (svector, tvector, normal), and
// writes them back to TEXCOORD1 and TEXCOORD2 (w = 0). Afterwards POSITION is
// projected with ModelViewProjectionMatrixM1 and TEXCOORD3 is produced from
// POSITION and the ModelToLightM1 uniform.
4071 void DPSOFTRAST_VertexShader_LightSource(void)
4074 int numvertices = dpsoftrast.numvertices;
4075 float LightPosition[4];
4076 float LightVector[4];
4077 float LightVectorModelSpace[4];
4078 float EyePosition[4];
4079 float EyeVectorModelSpace[4];
// Copy the two uniforms used per vertex into locals.
4085 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4086 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4087 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4088 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4089 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4090 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4091 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4092 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex arrays read from dpsoftrast.post_array4f below.
// NOTE(review): presumably Array_Load(X, X) fills post_array4f[X] from the
// input array X - confirm against DPSOFTRAST_Array_Load.
4093 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4094 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4095 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4096 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4097 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4098 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4099 for (i = 0;i < numvertices;i++)
// Read position and basis vectors first: TEXCOORD1 and TEXCOORD2 of this
// vertex are overwritten with the results later in the same iteration.
4101 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4102 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4103 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4104 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4105 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4106 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4107 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4108 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4109 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4110 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4111 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4112 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vertex-to-light vector projected onto (svector, tvector, normal) -> TEXCOORD1.
4113 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4114 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4115 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4116 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4117 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4118 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4119 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4120 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4121 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4122 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vertex-to-eye vector projected onto the same basis -> TEXCOORD2.
4123 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4124 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4125 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4126 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4127 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4128 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4129 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4130 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4131 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4132 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Project POSITION, then derive TEXCOORD3 from POSITION and ModelToLightM1.
// NOTE(review): presumably Array_Transform reads the untransformed input
// POSITION array rather than the projected result written just above, and
// the -1 argument has a special meaning in TransformProject - confirm both.
4134 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4135 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for a light source pass, run over one span (pixels startx..endx-1).
// For each pixel it combines the color texture (optionally plus pants/shirt layers) with
// ambient, diffuse and specular terms depending on thread->shader_permutation, scales the
// result by LightColor, an attenuation factor and optionally a cube filter texel, and
// stores BGRA bytes in buffer_FragColorbgra8, which is finally passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - source of uniform4f values, shader_permutation and shader_exactspecularmath
//   triangle - forwarded to the attribute and texture helpers
//   span     - provides startx/endx and is forwarded to the helpers
4138 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, indexed by x (and x*4 for the 4-byte-per-pixel buffers)
4141 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4142 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4143 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4145 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149 int x, startx = span->startx, endx = span->endx;
4150 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// each interpolated attribute is evaluated per pixel as (data + slope*x) * z
4151 float CubeVectordata[4];
4152 float CubeVectorslope[4];
4153 float LightVectordata[4];
4154 float LightVectorslope[4];
4155 float EyeVectordata[4];
4156 float EyeVectorslope[4];
4158 float diffusetex[4];
4160 float surfacenormal[4];
4161 float lightnormal[4];
4163 float specularnormal[4];
4166 float SpecularPower;
4167 float CubeVector[4];
// load the color uniforms; uniform component 0 is stored at index 2 and component 2 at index 0
// (presumably an RGB -> BGR swizzle to match the bgra8 texture/output buffers - confirm)
// NOTE(review): Color_Glow is loaded here but not referenced again in the visible code of this function
4170 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4171 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4172 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4173 Color_Glow[3] = 0.0f;
4174 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4175 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4176 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component comes from the separate Alpha uniform
4177 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4178 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4179 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4180 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4181 Color_Diffuse[3] = 0.0f;
4182 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4183 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4184 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4185 Color_Specular[3] = 0.0f;
// pants/shirt fourth components are 0, so the colormapping add below leaves diffusetex[3] unchanged
4186 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4187 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4188 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4189 Color_Pants[3] = 0.0f;
4190 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4191 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4192 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4193 Color_Shirt[3] = 0.0f;
4194 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4195 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4196 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4197 LightColor[3] = 0.0f;
// pre-divide by 255 because the exponent is later multiplied by the gloss texture's byte-valued fourth channel
4198 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// set up per-span data/slope pairs for the light vector (TEXCOORD1), eye vector (TEXCOORD2) and cube vector (TEXCOORD3)
4199 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4200 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4201 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4202 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4203 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// fetch the textures needed by every lighting path; pants/shirt and cube only when their permutation bits are set
4204 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4205 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4207 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4208 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4210 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4211 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// --- specular path: ambient + diffuse + specular, needs both the normal and the gloss texture ---
4212 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4214 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4215 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4216 for (x = startx;x < endx;x++)
// attenuation is 1 - |CubeVector|^2, optionally multiplied by the shadowmap sample;
// values below 0.01 are tested for (per the memset comment above, such near-black pixels are presumably skipped)
4219 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4220 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4221 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4222 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4223 if (attenuation < 0.01f)
4225 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4227 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4228 if (attenuation < 0.01f)
// diffuse texel as floats in byte range, plus tinted pants/shirt layers when colormapping is on
4232 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4233 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4234 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4235 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4236 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4238 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4239 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4240 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4241 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4243 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4244 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4245 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4246 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1 maps 0..255 to about -1..+1; channels 2,1,0 are read into components 0,1,2
4247 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4248 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4249 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4250 DPSOFTRAST_Vector3Normalize(surfacenormal);
4252 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4253 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4254 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4255 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(N, L) clamped to be non-negative
4257 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term, variant 1 (shader_exactspecularmath): dot of the eye direction with the reflected light direction
4259 if(thread->shader_exactspecularmath)
4261 // reflect lightnormal at surfacenormal, take the negative of that
4262 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4264 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4265 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4266 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4267 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4269 // dot of this and normalize(EyeVectorFogDepth.xyz)
4270 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4271 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4272 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4273 DPSOFTRAST_Vector3Normalize(eyenormal);
4275 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// specular term, variant 2: dot of the surface normal with the normalized sum of light and eye directions
4279 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4280 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4281 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4282 DPSOFTRAST_Vector3Normalize(eyenormal);
4284 specularnormal[0] = lightnormal[0] + eyenormal[0];
4285 specularnormal[1] = lightnormal[1] + eyenormal[1];
4286 specularnormal[2] = lightnormal[2] + eyenormal[2];
4287 DPSOFTRAST_Vector3Normalize(specularnormal);
4289 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform * gloss fourth channel / 255 (the /255 was folded into SpecularPower above)
4291 specular = pow(specular, SpecularPower * glosstex[3]);
// combine terms per channel and clamp to at most 255; the fourth channel is the diffuse texel's fourth channel
4293 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4295 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4296 attenuation *= (1.0f / 255.0f);
4297 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4298 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4299 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4300 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4304 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4305 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4306 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4307 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4309 buffer_FragColorbgra8[x*4+0] = d[0];
4310 buffer_FragColorbgra8[x*4+1] = d[1];
4311 buffer_FragColorbgra8[x*4+2] = d[2];
4312 buffer_FragColorbgra8[x*4+3] = d[3];
// --- diffuse path: ambient + diffuse, needs only the normal texture (no gloss, no specular term) ---
4315 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4317 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4318 for (x = startx;x < endx;x++)
// same attenuation computation and threshold tests as in the specular path
4321 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4322 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4323 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4324 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4325 if (attenuation < 0.01f)
4327 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4329 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4330 if (attenuation < 0.01f)
4334 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4335 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4336 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4337 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4338 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4340 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4341 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4342 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4343 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// normal map decode, identical to the specular path
4345 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4346 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4347 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4348 DPSOFTRAST_Vector3Normalize(surfacenormal);
4350 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4351 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4352 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4353 DPSOFTRAST_Vector3Normalize(lightnormal);
4355 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4356 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4358 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4359 attenuation *= (1.0f / 255.0f);
4360 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4361 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4362 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4363 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4367 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4368 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4369 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4370 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4372 buffer_FragColorbgra8[x*4+0] = d[0];
4373 buffer_FragColorbgra8[x*4+1] = d[1];
4374 buffer_FragColorbgra8[x*4+2] = d[2];
4375 buffer_FragColorbgra8[x*4+3] = d[3];
// --- ambient-only loop: no normal texture is fetched and only Color_Ambient is applied
// (presumably the fallback when neither SPECULAR nor DIFFUSE is set - confirm) ---
4380 for (x = startx;x < endx;x++)
4383 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4384 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4385 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4386 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4387 if (attenuation < 0.01f)
4389 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4391 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4392 if (attenuation < 0.01f)
4396 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4397 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4398 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4399 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4400 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4402 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4403 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4404 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4405 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4407 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4409 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4410 attenuation *= (1.0f / 255.0f);
4411 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4412 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4413 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4414 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4418 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4419 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4420 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4421 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4423 buffer_FragColorbgra8[x*4+0] = d[0];
4424 buffer_FragColorbgra8[x*4+1] = d[1];
4425 buffer_FragColorbgra8[x*4+2] = d[2];
4426 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colors to the common span finish routine
4429 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the refraction pass.
// Issues three array operations using matrices taken from dpsoftrast.uniform4f:
//  - TEXCOORD4 <- POSITION with the model-view-projection matrix (the refraction pixel shader
//    reads TEXCOORD4 as ModelViewProjectionPosition); argument order is presumably (dest, source, matrix)
//  - TEXCOORD0 <- TEXCOORD0 with the texture matrix
//  - POSITION transformed and projected with the model-view-projection matrix
// NOTE(review): the TEXCOORD4 transform reads POSITION before the last call rewrites it - keep this order.
4435 void DPSOFTRAST_VertexShader_Refraction(void)
4437 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4438 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4439 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the refraction pass, run over one span (pixels startx..endx-1).
// For each pixel it derives a screen-space texture coordinate from the interpolated
// model-view-projection position, offsets it by the normal map texel scaled by
// DistortScaleRefractReflect, samples the texture bound to GL20TU_REFRACTION there and
// writes the sample multiplied by RefractColor into the output span buffer.
// Returns without drawing anything when no refraction texture is bound.
//   thread   - source of texbound[] and uniform4f values
//   triangle - forwarded to the attribute and texture helpers
//   span     - provides startx/endx and is forwarded to the helpers
4442 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4444 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4446 int x, startx = span->startx, endx = span->endx;
// per-span scratch buffers, 4 bytes per pixel, indexed by x*4
4449 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4450 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// interpolated attribute, evaluated per pixel as data + slope*x
4453 float ModelViewProjectionPositiondata[4];
4454 float ModelViewProjectionPositionslope[4];
// local copies of uniforms; only the first two components of the *RefractReflect uniforms are used here
4457 float ScreenScaleRefractReflect[2];
4458 float ScreenCenterRefractReflect[2];
4459 float DistortScaleRefractReflect[2];
4460 float RefractColor[4];
// nothing to sample from without a bound refraction texture
4462 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4463 if(!texture) return;
4466 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4467 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4470 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4473 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4474 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4475 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4476 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4477 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4478 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// uniform components 0 and 2 are swapped on load (presumably RGB -> BGR to match the bgra8 buffers - confirm)
4479 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4480 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4481 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4482 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4485 for (x = startx;x < endx;x++)
4487 float SafeScreenTexCoord[2];
4488 float ScreenTexCoord[2];
4495 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4496 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4498 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4499 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4500 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4502 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// decode the normal map texel: byte/128 - 1, reading channels 2,1,0 into components 0,1,2
4503 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4504 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4505 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4506 DPSOFTRAST_Vector3Normalize(v);
// NOTE(review): the quoted shader line above says .zw, but this code uses components 0 and 1 of the uniform
4507 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4508 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4510 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4511 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// the first three output channels are texel * RefractColor; the fourth is RefractColor[3]*256 capped at 255
4513 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4514 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4515 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4516 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4519 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the water pass.
// For every vertex it computes the vector from the vertex position to the EyePosition uniform,
// expresses it in the basis formed by the vectors read from TEXCOORD1, TEXCOORD2 and TEXCOORD3
// (named svector, tvector and normal here) and stores the result in TEXCOORD6, which the water
// pixel shader reads as EyeVector. Afterwards it issues the position and texcoord transforms.
4524 void DPSOFTRAST_VertexShader_Water(void)
4527 int numvertices = dpsoftrast.numvertices;
4528 float EyePosition[4];
4529 float EyeVectorModelSpace[4];
// NOTE(review): EyePosition[3] is loaded but only components 0..2 are used in the visible code
4535 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4536 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4537 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4538 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// make the input arrays available in dpsoftrast.post_array4f before the per-vertex loop reads them
// (presumably what DPSOFTRAST_Array_Load does - confirm against its definition)
4539 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4540 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4541 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4542 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4543 for (i = 0;i < numvertices;i++)
// gather the first three components of position and of the three basis vectors for vertex i
4545 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4546 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4547 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4548 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4549 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4550 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4551 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4552 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4553 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4554 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4555 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4556 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the eye
4557 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4558 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4559 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// dot products with svector, tvector and normal give the eye vector's components in that basis
4560 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4561 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4562 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// output goes to TEXCOORD6 with the fourth component set to 0
4563 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4564 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4565 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4566 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// position transform/projection, model-view-projection position into TEXCOORD4 (read by the water
// pixel shader), and texture matrix applied to TEXCOORD0
// NOTE(review): the meaning of the -1 source argument is not visible here - see DPSOFTRAST_Array_TransformProject
4568 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4569 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4570 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4574 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4576 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4578 int x, startx = span->startx, endx = span->endx;
4581 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4582 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4585 float ModelViewProjectionPositiondata[4];
4586 float ModelViewProjectionPositionslope[4];
4587 float EyeVectordata[4];
4588 float EyeVectorslope[4];
4591 float ScreenScaleRefractReflect[2];
4592 float ScreenCenterRefractReflect[2];
4593 float DistortScaleRefractReflect[2];
4594 float RefractColor[4];
4595 float ReflectColor[4];
4596 float ReflectFactor;
4597 float ReflectOffset;
4599 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4600 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4601 if(!texture_refraction || !texture_reflection) return;
4604 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4605 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4608 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4609 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4612 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4613 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4614 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4615 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4616 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4617 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4618 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4619 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4620 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4621 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4622 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4623 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4624 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4625 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4626 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4627 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4628 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4629 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4630 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4631 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4632 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4633 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4636 for (x = startx;x < endx;x++)
4638 float SafeScreenTexCoord[4];
4639 float ScreenTexCoord[4];
4642 unsigned char c1[4];
4643 unsigned char c2[4];
4648 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4649 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4651 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4652 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4653 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4654 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4655 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4657 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4658 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4659 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4660 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4661 DPSOFTRAST_Vector3Normalize(v);
4662 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4663 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4664 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4665 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4667 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4668 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4669 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4670 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4671 DPSOFTRAST_Vector3Normalize(v);
4672 Fresnel = 1.0f - v[2];
4673 Fresnel = min(1.0f, Fresnel);
4674 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4676 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4677 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4678 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4679 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4681 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4682 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4683 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4684 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4687 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both array
// arguments and the uniform4f block that starts at ModelViewProjectionMatrixM1.
// NOTE(review): presumably this projects the position array by the MVP matrix - confirm
// against DPSOFTRAST_Array_TransformProject, which is not visible here.
4692 void DPSOFTRAST_VertexShader_ShowDepth(void)
4694 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the ShowDepth shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// The visible statements only zero the BGRA8 bytes for pixels [span->startx, span->endx)
// and pass that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8; no depth value is turned into a colour here.
// NOTE(review): looks like a placeholder implementation - confirm.
4697 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest span (one float, and four bytes, per pixel)
4700 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4701 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is handed to Draw_Span_Begin (presumably filled with per-pixel z - confirm) and not read again in this function
4702 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear 4 bytes per pixel for the active part of the span
4703 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4704 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both array
// arguments and the uniform4f block that starts at ModelViewProjectionMatrixM1.
// NOTE(review): presumably this projects the position array by the MVP matrix - confirm
// against DPSOFTRAST_Array_TransformProject, which is not visible here.
4709 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4711 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the DeferredGeometry shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// The visible statements only zero the BGRA8 bytes for pixels [span->startx, span->endx)
// and pass that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8; no geometry data is written.
// NOTE(review): looks like a placeholder implementation - confirm.
4714 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest span (one float, and four bytes, per pixel)
4717 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4718 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is handed to Draw_Span_Begin and not read again in this function
4719 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear 4 bytes per pixel for the active part of the span
4720 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4721 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both array
// arguments and the uniform4f block that starts at ModelViewProjectionMatrixM1.
// NOTE(review): presumably this projects the position array by the MVP matrix - confirm
// against DPSOFTRAST_Array_TransformProject, which is not visible here.
4726 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4728 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the DeferredLightSource shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// The visible statements only zero the BGRA8 bytes for pixels [span->startx, span->endx)
// and pass that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8; no lighting is evaluated.
// NOTE(review): looks like a placeholder implementation - confirm.
4731 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest span (one float, and four bytes, per pixel)
4734 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4735 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is handed to Draw_Span_Begin and not read again in this function
4736 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear 4 bytes per pixel for the active part of the span
4737 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4738 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex and span entry points plus the vertex arrays
// and texture units it uses. Instances live in DPSOFTRAST_ShaderModeTable.
// NOTE(review): this listing skips some source lines (the embedded line numbers are not
// contiguous); the table rows start with an integer and the draw code reads a member
// named lodarrayindex, so a leading field is presumably declared on a line not shown.
4743 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; invoked with no arguments by DPSOFTRAST_DrawTriangles
4746 void (*Vertex)(void);
// span stage; invoked once per surviving span by DPSOFTRAST_Draw_ProcessSpans
4747 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// array indices used by this mode; readers compare each entry against DPSOFTRAST_ARRAY_TOTAL
// (table rows end the list with ~0, presumably as a terminator - confirm)
4748 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by this mode; readers compare each entry against DPSOFTRAST_MAXTEXTUREUNITS
// (table rows end the list with ~0, presumably as a terminator - confirm)
4749 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4751 DPSOFTRAST_ShaderModeInfo;
// Shader mode table, looked up with a shader_mode value (SHADERMODE_COUNT rows).
// Each row is: leading integer (read elsewhere as lodarrayindex), vertex function,
// span function, list of array indices, list of texture unit indices.
// Both lists are ended with ~0, which stored in an unsigned char is larger than any
// valid index, so the "index >= limit" tests in the draw code reject it.
// The last three rows used to omit the texture unit list; C zero-fills an omitted
// initializer, which made every slot read as texture unit 0 with no ~0 entry. They now
// carry an explicit {~0}, consistent with the Depth_Or_Shadow row.
4753 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4755 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4756 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4757 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4758 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4759 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4760 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4761 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4762 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4763 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4764 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4765 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4766 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4767 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4768 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}, {~0}},
4769 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}, {~0}},
4770 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}, {~0}},
// Builds the per-pixel visibility mask for one span and shrinks the span to the pixels that passed.
// thread: supplies the depth-test state (depthtest, fb_depthfunc) and the mask storage (pixelmaskarray).
// span:   read for position and depth gradient; receives pixelmask and the trimmed startx.
// NOTE(review): this listing skips some source lines (embedded line numbers are not contiguous),
// including several local declarations, the assignment of endx and the bodies of the trimming
// loops; comments below describe only the visible statements.
4773 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4778 unsigned int *depthpixel;
4782 unsigned char *pixelmask;
4783 DPSOFTRAST_State_Triangle *triangle;
4784 triangle = &thread->triangles[span->triangle];
// depth buffer address for this span: row span->y, column span->x; x below is relative to span->x
4785 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4786 startx = span->startx;
4788 depth = span->depthbase;
4789 depthslope = span->depthslope;
4790 pixelmask = thread->pixelmaskarray;
// only test when depth testing is enabled and a depth buffer is bound
4791 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4793 switch(thread->fb_depthfunc)
// each case walks the span, stepping the interpolated depth d by depthslope per pixel.
// The stored value is on the left of every comparison (e.g. GL_LESS passes when stored < d).
// NOTE(review): depthbase is derived from w in DPSOFTRAST_Interpret_Draw, so a larger d
// presumably means nearer - confirm before changing the comparison direction.
4796 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4797 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4798 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4799 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4800 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4801 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4802 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// trim rejected pixels from both ends of the span (loop bodies are on lines not shown;
// presumably startx++ and endx-- respectively)
4804 while (startx < endx && !pixelmask[startx])
4806 while (endx > startx && !pixelmask[endx-1])
4811 // no depth testing means we're just dealing with color...
4812 memset(pixelmask + startx, 1, endx - startx);
// publish the mask and the trimmed start on the span
4814 span->pixelmask = pixelmask;
4815 span->startx = startx;
// Depth-write pass for one span, run by DPSOFTRAST_Draw_ProcessSpans after the span shader.
// thread: supplies depthmask/depthtest state; span: supplies position, depth gradient and pixelmask.
// NOTE(review): this listing skips some source lines (embedded line numbers are not contiguous),
// including the assignment of endx and the body of the loop; the loop presumably stores d into
// depthpixel[x] for pixels whose mask is set - confirm against the full source.
4819 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4821 int x, d, depth, depthslope, startx, endx;
4822 const unsigned char *pixelmask;
4823 unsigned int *depthpixel;
// nothing to do unless depth writes and depth testing are enabled and a depth buffer is bound
4824 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4826 depth = span->depthbase;
4827 depthslope = span->depthslope;
4828 pixelmask = span->pixelmask;
4829 startx = span->startx;
// depth buffer address for this span: row span->y, column span->x
4831 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// walk the span, stepping the interpolated depth by depthslope per pixel
4832 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Runs every span queued on this thread through depth test, span shader and depth write,
// then empties the queue (thread->numspans = 0).
// NOTE(review): this listing skips some source lines (embedded line numbers are not contiguous);
// the statement guarded by the "span->startx >= span->endx" test is not shown and is
// presumably a skip of the now-empty span - confirm.
4838 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4841 DPSOFTRAST_State_Triangle *triangle;
4842 DPSOFTRAST_State_Span *span;
4843 for (i = 0; i < thread->numspans; i++)
4845 span = &thread->spans[i];
// each span refers to its triangle by index into thread->triangles
4846 triangle = &thread->triangles[span->triangle];
// may shrink span->startx (and the span end) to the pixels that passed the depth test
4847 DPSOFTRAST_Draw_DepthTest(thread, span);
4848 if (span->startx >= span->endx)
4850 // run pixel shader if appropriate
4851 // do this before running depthmask code, to allow the pixelshader
4852 // to clear pixelmask values for alpha testing
// shading is skipped when colour buffer 0 is absent or the colour mask is zero
4853 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4854 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4855 DPSOFTRAST_Draw_DepthWrite(thread, span);
4857 thread->numspans = 0;
// Declares the Draw command (id 22) through the DEFCOMMAND macro with the fields listed below.
// starty/endy: row range of the draw, compared against each thread's bands;
// refcount: decremented with ATOMIC_DECREMENT by DPSOFTRAST_Interpret_Draw, and the vertex data is released when it reaches zero;
// arrays: base of the vertex data block; element3i/element3s: copied index data (one of them is used).
// NOTE(review): datasize is declared here but not read in the visible code - confirm its use elsewhere.
4860 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on a worker thread.
// For every triangle the visible code: reads three vertex indices, clips against the near plane,
// takes or computes screen coordinates, rejects triangles outside the scissor box or outside this
// thread's row bands, sets up plane equations for w and for each array used by the current shader
// mode, picks a mip level per texture unit, and scan-converts the polygon into spans that are
// queued on the thread and flushed through DPSOFTRAST_Draw_ProcessSpans.
// When the command's refcount drops to zero, separately allocated vertex data is freed.
// thread:  per-thread state (scissor, bands, viewport, triangle/span queues).
// command: the Draw command produced by DPSOFTRAST_DrawTriangles.
// NOTE(review): this listing skips many source lines (embedded line numbers are not contiguous):
// braces, several declarations, break/continue/return statements and some conditions are missing.
// Comments below describe only the visible statements; anything marked "presumably" must be
// confirmed against the full source.
4862 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4865 int cullface = thread->cullface;
4866 int minx, maxx, miny, maxy;
4867 int miny1, maxy1, miny2, maxy2;
4868 __m128i fbmin, fbmax;
4869 __m128 viewportcenter, viewportscale;
4870 int firstvertex = command->firstvertex;
4871 int numvertices = command->numvertices;
4872 int numtriangles = command->numtriangles;
4873 const int *element3i = command->element3i;
4874 const unsigned short *element3s = command->element3s;
4875 int clipped = command->clipped;
4882 int starty, endy, bandy;
4886 float clip0origin, clip0slope;
4888 __m128 triangleedge1, triangleedge2, trianglenormal;
4891 DPSOFTRAST_State_Triangle *triangle;
4892 DPSOFTRAST_Texture *texture;
4893 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, and this thread's two row bands clamped into it
4894 miny = thread->fb_scissor[1];
4895 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4896 miny1 = bound(miny, thread->miny1, maxy);
4897 maxy1 = bound(miny, thread->maxy1, maxy);
4898 miny2 = bound(miny, thread->miny2, maxy);
4899 maxy2 = bound(miny, thread->maxy2, maxy);
// the command touches neither band: drop this thread's reference and (presumably) return early
4900 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4902 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than its aligned header had its data allocated separately (see DPSOFTRAST_Draw_AllocateDrawCommand)
4904 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4905 MM_FREE(command->arrays);
4909 minx = thread->fb_scissor[0];
4910 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp box as repeated 16-bit (x, y) pairs; fbmax is made inclusive by subtracting 1
4911 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4912 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4913 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4914 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4915 screen[3] = _mm_setzero_ps();
4916 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4917 for (i = 0;i < numtriangles;i++)
// the data block starts with numvertices float4 screen coordinates, followed by the float4 arrays
4919 const float *screencoord4f = command->arrays;
4920 const float *arrays = screencoord4f + numvertices*4;
4922 // generate the 3 edges of this triangle
4923 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices are rebased by firstvertex; the test that chooses between the 16-bit and 32-bit index arrays is not shown
4926 e[0] = element3s[i*3+0] - firstvertex;
4927 e[1] = element3s[i*3+1] - firstvertex;
4928 e[2] = element3s[i*3+2] - firstvertex;
4932 e[0] = element3i[i*3+0] - firstvertex;
4933 e[1] = element3i[i*3+1] - firstvertex;
4934 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below:
//  SKIPBACKFACE      - builds two screen-space edges from screen[1] and the scalar (lane 0) cross-product term
//                      trianglenormal, then compares it with zero; the guarded statements are not shown
//                      (presumably a skip that depends on cullface - confirm).
//  CLIPPEDVERTEXLERP - stores clipdist[p1] / (clipdist[p1] - clipdist[p2]) in clipfrac[p1], interpolates
//                      between vertices p1 and p2 in arrays, and projects the result into screen[k].
//  CLIPPEDVERTEXCOPY - loads the precomputed screen coordinate of vertex p1 into screen[k].
//  GENATTRIBCOPY / GENATTRIBLERP / GENATTRIBS - produce the three per-corner attribute values, reusing
//                      clipfrac, according to a case number whose switch expression is not shown.
4943 #define SKIPBACKFACE \
4944 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4945 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4946 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4947 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4948 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4952 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4956 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4961 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4962 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4964 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4965 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4967 #define CLIPPEDVERTEXCOPY(k,p1) \
4968 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4970 #define GENATTRIBCOPY(attrib, p1) \
4971 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4972 #define GENATTRIBLERP(attrib, p1, p2) \
4974 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4975 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4977 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4981 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4982 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4983 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4984 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4985 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4986 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4987 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4993 // calculate distance from nearplane
4994 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4995 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4996 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4997 if (clipdist[0] >= 0.0f)
4999 if (clipdist[1] >= 0.0f)
5001 if (clipdist[2] >= 0.0f)
5004 // triangle is entirely in front of nearplane
5005 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5012 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5020 if (clipdist[2] >= 0.0f)
5022 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5029 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5036 else if (clipdist[1] >= 0.0f)
5038 if (clipdist[2] >= 0.0f)
5040 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5047 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5053 else if (clipdist[2] >= 0.0f)
5055 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5060 else continue; // triangle is entirely behind nearplane
// integer bounding box of the 3 or 4 screen points, packed to signed 16-bit and clamped to fbmin/fbmax
5063 // calculate integer y coords for triangle points
5064 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5065 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5066 screenmin = _mm_min_epi16(screeni, screenir),
5067 screenmax = _mm_max_epi16(screeni, screenir);
5068 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5069 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5070 screenmin = _mm_max_epi16(screenmin, fbmin);
5071 screenmax = _mm_min_epi16(screenmax, fbmax);
5072 // skip offscreen triangles
5073 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// lane 1 holds y; endy is exclusive
5075 starty = _mm_extract_epi16(screenmin, 1);
5076 endy = _mm_extract_epi16(screenmax, 1)+1;
// the clamped row range lies between the end of band 1 and the start of band 2 (guarded statement not shown)
5077 if (starty >= maxy1 && endy <= miny2)
// keep only the y halves of the packed coordinates, as 32-bit lanes, for the per-row edge tests
5079 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle queue; numtriangles is advanced after the spans are generated
5082 triangle = &thread->triangles[thread->numtriangles];
5084 // calculate attribute plans for triangle data...
5085 // okay, this triangle is going to produce spans, we'd better project
5086 // the interpolants now (this is what gives perspective texturing),
5087 // this consists of simply multiplying all arrays by the W coord
5088 // (which is basically 1/Z), which will be undone per-pixel
5089 // (multiplying by Z again) to get the perspective-correct array
5092 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5093 __m128 mipedgescale, mipdensity;
// edge vectors divided by the scalar normal term give the factors used to turn corner differences into x/y slopes
5094 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5095 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5096 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5097 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5098 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane equation for w (component 3 of the screen coordinates): x slope, y slope, origin -> triangle->w[0..2]
5099 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5100 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5101 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5102 attribedge1 = _mm_sub_ss(w0, w1);
5103 attribedge2 = _mm_sub_ss(w2, w1);
5104 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5105 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5106 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5107 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5108 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5109 _mm_store_ss(&triangle->w[0], attribxslope);
5110 _mm_store_ss(&triangle->w[1], attribyslope);
5111 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: when any of fb_clipplane[0..2] is non-zero, build the plane equation of screen z
// (component 2) and combine it with fb_clipplane into cliporigin/clipxslope/clipyslope, from which the
// per-row clip position clip0 (origin + slope per row) is derived; some branch conditions are not shown
5116 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5118 float cliporigin, clipxslope, clipyslope;
5119 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5120 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5121 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5122 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5123 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5124 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5125 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5126 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5127 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5130 clip0origin = -cliporigin/clipxslope;
5131 clip0slope = -clipyslope/clipxslope;
5132 clip0dir = clipxslope > 0 ? 1 : -1;
5134 else if(clipyslope > 0)
5136 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5137 clip0slope = dpsoftrast.fb_width;
5140 else if(clipyslope < 0)
5142 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5143 clip0slope = -dpsoftrast.fb_width;
5146 else if(clip0origin < 0) continue;
// per-array plane equations: every array listed for the current shader mode is multiplied by w at each
// corner, then x slope, y slope and origin are stored in triangle->attribs[k][0..2]
5149 mipedgescale = _mm_setzero_ps();
5150 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5152 __m128 attrib0, attrib1, attrib2;
5153 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// out-of-range entry (guarded statement not shown; presumably ends the loop)
5154 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// advance to the next float4 array in the data block
5156 arrays += numvertices*4;
5157 GENATTRIBS(attrib0, attrib1, attrib2);
5158 attriborigin = _mm_mul_ps(attrib1, w1);
5159 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5160 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5161 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5162 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5163 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5164 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5165 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5166 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array selected by lodarrayindex: attribute change along each edge scaled by the reciprocal screen-space edge length
5167 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5169 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5170 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5171 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5172 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip level per texture unit listed for the current shader mode; units left untouched keep level 0
5176 memset(triangle->mip, 0, sizeof(triangle->mip));
5177 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5179 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// out-of-range entry (guarded statement not shown; presumably ends the loop)
5180 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5182 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed mip level
5183 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5185 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5186 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5187 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5188 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5189 // this will be multiplied in the texturing routine by the texture resolution
5190 y = _mm_cvtss_si32(mipdensity);
// log(y)*0.5/ln(2) equals log2(sqrt(y)); the result is clamped to the last available mip level
5193 y = (int)(log((float)y)*0.5f/M_LN2);
5194 if (y > texture->mipmaps - 1)
5195 y = texture->mipmaps - 1;
5196 triangle->mip[texunit] = y;
// scan conversion: rows are processed band by band (band 1 up to maxy1, then band 2 from miny2 up to maxy2)
5202 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5205 __m128 xcoords, xslope;
// ycc lane n is all-ones when y is greater than point n's integer y; yccmask holds 4 mask bits per point
5206 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5207 int yccmask = _mm_movemask_epi8(ycc);
5208 int edge0p, edge0n, edge1p, edge1n;
// choose the two polygon edges (start point p, end point n) that cross the current row;
// first table is for 4 points, second for 3 (the switch statements themselves are not shown)
5217 case 0xFFFF: /*0000*/ y = endy; continue;
5218 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5219 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5220 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5221 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5222 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5223 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5224 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5225 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5226 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5227 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5228 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5229 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5230 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5231 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5232 case 0x0000: /*1111*/ y++; continue;
5240 case 0xFFFF: /*000*/ y = endy; continue;
5241 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5242 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5243 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5244 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5245 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5246 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5247 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can use the same edge pair, limited to the current band
5250 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5251 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5252 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5253 nexty = _mm_extract_epi16(ycc, 0);
5254 if (nexty >= bandy) nexty = bandy-1;
// dx/dy of both edges and their x positions at row y (plus 0.5); swapped if needed so lane 0 is the smaller x
5255 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5256 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5257 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5258 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5259 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5260 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5262 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5263 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5265 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5266 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5268 int startx, endx, offset;
// row extent from the two edge positions, clamped to the horizontal scissor range
5269 startx = _mm_cvtss_si32(xcoords);
5270 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5271 if (startx < minx) startx = minx;
5272 if (endx > maxx) endx = maxx;
5273 if (startx >= endx) continue;
// trim or drop the row against the user clip position clip0 (the surrounding conditions are not shown)
5281 if(endx <= clip0) continue;
5282 startx = (int)clip0;
5285 else if (endx > clip0)
5287 if(startx >= clip0) continue;
// emit the row in chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5292 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5294 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5295 span->triangle = thread->numtriangles;
5299 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5300 if (span->startx >= span->endx)
// fixed-point depth for the span from the w plane equation, with the polygon offset terms applied to the base
5302 wslope = triangle->w[0];
5303 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5304 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5305 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: shade everything queued so far
5306 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5307 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush the spans that reference it and start over
5312 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5314 DPSOFTRAST_Draw_ProcessSpans(thread);
5315 thread->numtriangles = 0;
// drop this thread's reference; the thread that brings refcount to zero frees separately allocated vertex data
5319 if (!ATOMIC_DECREMENT(command->refcount))
5321 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5322 MM_FREE(command->arrays);
// flush whatever is still queued for this command
5325 if (thread->numspans > 0 || thread->numtriangles > 0)
5327 DPSOFTRAST_Draw_ProcessSpans(thread);
5328 thread->numtriangles = 0;
5333 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5337 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5338 int datasize = 2*numvertices*sizeof(float[4]);
5339 DPSOFTRAST_Command_Draw *command;
5340 unsigned char *data;
5341 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5343 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5344 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5346 datasize += numvertices*sizeof(float[4]);
5349 datasize += numtriangles*sizeof(unsigned short[3]);
5351 datasize += numtriangles*sizeof(int[3]);
5352 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5353 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5355 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5356 data = (unsigned char *)MM_CALLOC(datasize, 1);
5360 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5361 data = (unsigned char *)command + commandsize;
5363 command->firstvertex = firstvertex;
5364 command->numvertices = numvertices;
5365 command->numtriangles = numtriangles;
5366 command->arrays = (float *)data;
5367 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5368 dpsoftrast.firstvertex = firstvertex;
5369 dpsoftrast.numvertices = numvertices;
5370 dpsoftrast.screencoord4f = (float *)data;
5371 data += numvertices*sizeof(float[4]);
5372 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5373 data += numvertices*sizeof(float[4]);
5374 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5376 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5377 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5379 dpsoftrast.post_array4f[j] = (float *)data;
5380 data += numvertices*sizeof(float[4]);
5382 command->element3i = NULL;
5383 command->element3s = NULL;
5386 command->element3s = (unsigned short *)data;
5387 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5391 command->element3i = (int *)data;
5392 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queues a draw of numtriangles indexed triangles using the current shader mode.
// firstvertex/numvertices: vertex range, forwarded to the command allocation.
// element3i/element3s: index arrays, forwarded to the command allocation.
// The vertex stage of the current shader mode runs immediately on the calling thread; rasterization
// is left to the command interpreter.
// NOTE(review): this listing skips some source lines (embedded line numbers are not contiguous);
// the empty-range branch presumably returns after undoing the command, and the final
// DPSOFTRAST_Draw_FlushThreads call presumably belongs to the non-threaded path - confirm.
5397 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5399 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5400 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// row range recorded in dpsoftrast.drawstarty/drawendy, clamped to the framebuffer height
5401 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5402 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated vertex data and take the command back out of the pool
5403 if (command->starty >= command->endy)
5405 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5406 MM_FREE(command->arrays);
5407 DPSOFTRAST_UndoCommand(command->commandsize);
5410 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each DPSOFTRAST_Interpret_Draw call decrements it once
5411 command->refcount = dpsoftrast.numthreads;
5413 if (dpsoftrast.usethreads)
5416 DPSOFTRAST_Draw_SyncCommands();
// signal threads flagged as starving whose row bands overlap the draw's row range
5417 for (i = 0; i < dpsoftrast.numthreads; i++)
5419 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5420 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5421 Thread_CondSignal(thread->drawcond);
5426 DPSOFTRAST_Draw_FlushThreads();
5430 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5431 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5433 thread->validate |= DPSOFTRAST_VALIDATE_FB;
5435 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5437 DPSOFTRAST_Command_SetRenderTargets *command;
5438 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5439 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5440 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5442 dpsoftrast.fb_width = width;
5443 dpsoftrast.fb_height = height;
5444 dpsoftrast.fb_depthpixels = depthpixels;
5445 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5446 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5447 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5448 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5449 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5450 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5451 command->width = width;
5452 command->height = height;
5455 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5457 int commandoffset = thread->commandoffset;
5458 while (commandoffset != endoffset)
5460 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5461 switch (command->opcode)
5463 #define INTERPCOMMAND(name) \
5464 case DPSOFTRAST_OPCODE_##name : \
5465 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5466 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5467 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5468 commandoffset = 0; \
5470 INTERPCOMMAND(Viewport)
5471 INTERPCOMMAND(ClearColor)
5472 INTERPCOMMAND(ClearDepth)
5473 INTERPCOMMAND(ColorMask)
5474 INTERPCOMMAND(DepthTest)
5475 INTERPCOMMAND(ScissorTest)
5476 INTERPCOMMAND(Scissor)
5477 INTERPCOMMAND(BlendFunc)
5478 INTERPCOMMAND(BlendSubtract)
5479 INTERPCOMMAND(DepthMask)
5480 INTERPCOMMAND(DepthFunc)
5481 INTERPCOMMAND(DepthRange)
5482 INTERPCOMMAND(PolygonOffset)
5483 INTERPCOMMAND(CullFace)
5484 INTERPCOMMAND(AlphaTest)
5485 INTERPCOMMAND(AlphaFunc)
5486 INTERPCOMMAND(SetTexture)
5487 INTERPCOMMAND(SetShader)
5488 INTERPCOMMAND(Uniform4f)
5489 INTERPCOMMAND(UniformMatrix4f)
5490 INTERPCOMMAND(Uniform1i)
5491 INTERPCOMMAND(SetRenderTargets)
5492 INTERPCOMMAND(ClipPlane)
5494 case DPSOFTRAST_OPCODE_Draw:
5495 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5496 commandoffset += command->commandsize;
5497 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5499 thread->commandoffset = commandoffset;
5502 case DPSOFTRAST_OPCODE_Reset:
5507 thread->commandoffset = commandoffset;
5510 static int DPSOFTRAST_Draw_Thread(void *data)
5512 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5513 while(thread->index >= 0)
5515 if (thread->commandoffset != dpsoftrast.drawcommand)
5517 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5521 Thread_LockMutex(thread->drawmutex);
5522 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5524 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5525 thread->starving = true;
5526 Thread_CondWait(thread->drawcond, thread->drawmutex);
5527 thread->starving = false;
5529 Thread_UnlockMutex(thread->drawmutex);
5535 static void DPSOFTRAST_Draw_FlushThreads(void)
5537 DPSOFTRAST_State_Thread *thread;
5539 DPSOFTRAST_Draw_SyncCommands();
5540 if (dpsoftrast.usethreads)
5542 for (i = 0; i < dpsoftrast.numthreads; i++)
5544 thread = &dpsoftrast.threads[i];
5545 if (thread->commandoffset != dpsoftrast.drawcommand)
5547 Thread_LockMutex(thread->drawmutex);
5548 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5549 Thread_CondSignal(thread->drawcond);
5550 Thread_UnlockMutex(thread->drawmutex);
5553 for (i = 0; i < dpsoftrast.numthreads; i++)
5555 thread = &dpsoftrast.threads[i];
5556 if (thread->commandoffset != dpsoftrast.drawcommand)
5558 Thread_LockMutex(thread->drawmutex);
5559 if (thread->commandoffset != dpsoftrast.drawcommand)
5561 thread->waiting = true;
5562 Thread_CondWait(thread->waitcond, thread->drawmutex);
5563 thread->waiting = false;
5565 Thread_UnlockMutex(thread->drawmutex);
5571 for (i = 0; i < dpsoftrast.numthreads; i++)
5573 thread = &dpsoftrast.threads[i];
5574 if (thread->commandoffset != dpsoftrast.drawcommand)
5575 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5578 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: delegates to DPSOFTRAST_Draw_FlushThreads(),
// which does not return until all queued commands have been processed.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
// Public "finish" entry point; takes no arguments and returns nothing.
// NOTE(review): the body of this function is not visible in this excerpt;
// presumably it drains pending work the same way DPSOFTRAST_Flush() does -
// confirm against the full file.
5586 void DPSOFTRAST_Finish(void)
5591 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5601 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5602 dpsoftrast.bigendian = u.b[3];
5603 dpsoftrast.fb_width = width;
5604 dpsoftrast.fb_height = height;
5605 dpsoftrast.fb_depthpixels = depthpixels;
5606 dpsoftrast.fb_colorpixels[0] = colorpixels;
5607 dpsoftrast.fb_colorpixels[1] = NULL;
5608 dpsoftrast.fb_colorpixels[1] = NULL;
5609 dpsoftrast.fb_colorpixels[1] = NULL;
5610 dpsoftrast.viewport[0] = 0;
5611 dpsoftrast.viewport[1] = 0;
5612 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5613 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5614 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5615 dpsoftrast.texture_firstfree = 1;
5616 dpsoftrast.texture_end = 1;
5617 dpsoftrast.texture_max = 0;
5618 dpsoftrast.color[0] = 1;
5619 dpsoftrast.color[1] = 1;
5620 dpsoftrast.color[2] = 1;
5621 dpsoftrast.color[3] = 1;
5622 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5623 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5624 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5625 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5626 for (i = 0; i < dpsoftrast.numthreads; i++)
5628 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5630 thread->cullface = GL_BACK;
5631 thread->colormask[0] = 1;
5632 thread->colormask[1] = 1;
5633 thread->colormask[2] = 1;
5634 thread->colormask[3] = 1;
5635 thread->blendfunc[0] = GL_ONE;
5636 thread->blendfunc[1] = GL_ZERO;
5637 thread->depthmask = true;
5638 thread->depthtest = true;
5639 thread->depthfunc = GL_LEQUAL;
5640 thread->scissortest = false;
5641 thread->alphatest = false;
5642 thread->alphafunc = GL_GREATER;
5643 thread->alphavalue = 0.5f;
5644 thread->viewport[0] = 0;
5645 thread->viewport[1] = 0;
5646 thread->viewport[2] = dpsoftrast.fb_width;
5647 thread->viewport[3] = dpsoftrast.fb_height;
5648 thread->scissor[0] = 0;
5649 thread->scissor[1] = 0;
5650 thread->scissor[2] = dpsoftrast.fb_width;
5651 thread->scissor[3] = dpsoftrast.fb_height;
5652 thread->depthrange[0] = 0;
5653 thread->depthrange[1] = 1;
5654 thread->polygonoffset[0] = 0;
5655 thread->polygonoffset[1] = 0;
5656 thread->clipplane[0] = 0;
5657 thread->clipplane[1] = 0;
5658 thread->clipplane[2] = 0;
5659 thread->clipplane[3] = 1;
5661 thread->numspans = 0;
5662 thread->numtriangles = 0;
5663 thread->commandoffset = 0;
5664 thread->waiting = false;
5665 thread->starving = false;
5667 thread->validate = -1;
5668 DPSOFTRAST_Validate(thread, -1);
5670 if (dpsoftrast.usethreads)
5672 thread->waitcond = Thread_CreateCond();
5673 thread->drawcond = Thread_CreateCond();
5674 thread->drawmutex = Thread_CreateMutex();
5675 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5681 void DPSOFTRAST_Shutdown(void)
5684 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5686 DPSOFTRAST_State_Thread *thread;
5687 for (i = 0; i < dpsoftrast.numthreads; i++)
5689 thread = &dpsoftrast.threads[i];
5690 Thread_LockMutex(thread->drawmutex);
5692 Thread_CondSignal(thread->drawcond);
5693 Thread_UnlockMutex(thread->drawmutex);
5694 Thread_WaitThread(thread->thread, 0);
5695 Thread_DestroyCond(thread->waitcond);
5696 Thread_DestroyCond(thread->drawcond);
5697 Thread_DestroyMutex(thread->drawmutex);
5700 for (i = 0;i < dpsoftrast.texture_end;i++)
5701 if (dpsoftrast.texture[i].bytes)
5702 MM_FREE(dpsoftrast.texture[i].bytes);
5703 if (dpsoftrast.texture)
5704 free(dpsoftrast.texture);
5705 if (dpsoftrast.threads)
5706 MM_FREE(dpsoftrast.threads);
5707 memset(&dpsoftrast, 0, sizeof(dpsoftrast));