3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Alignment (in bytes) handed to _mm_malloc by MM_MALLOC/MM_CALLOC; the
// ATOMIC() wrappers below hard-code the same value.
18 #define ATOMIC_SIZE 32
// Per-toolchain definitions of:
//   ALIGN(var)   - declare var with 16 byte alignment
//   ATOMIC(var)  - declare var with 32 byte alignment
//   MEMORY_BARRIER, ATOMIC_COUNTER, ATOMIC_INCREMENT/DECREMENT/ADD
// Apple: GCC-style attributes plus the OSAtomic*Barrier functions.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: the __sync_* builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) plus the Interlocked* functions.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: no alignment request, no barrier, and plain (non-atomic)
// integer arithmetic on the counter.
// NOTE(review): the preprocessor conditions guarding ALIGN/ATOMIC/ATOMIC_ADD
// are not visible in this listing - confirm they are only used when the
// definitions above are absent.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
74 #include <emmintrin.h>
// Allocation helpers built on _mm_malloc/_mm_free; blocks are aligned to
// ATOMIC_SIZE bytes.
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// Zero-filled counterpart of MM_MALLOC for nmemb elements of size bytes each.
// NOTE(review): nmemb*size is not checked for overflow, unlike calloc().
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
// only clear the block when the allocation succeeded
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
85 #define MM_FREE _mm_free
// Alternative definitions that map straight onto the C library allocator
// (the preprocessor condition choosing between the two sets is not visible
// in this listing).
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays. DPSOFTRAST_ARRAY_TOTAL sizes
// DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f.
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
// number of arrays above; not an array index itself
104 DPSOFTRAST_ARRAY_TOTAL
// One slot of the dpsoftrast.texture array (only part of the field list is
// visible in this listing).
108 typedef struct DPSOFTRAST_Texture_s
// filter mode stored by DPSOFTRAST_Texture_Filter
115 DPSOFTRAST_TEXTURE_FILTER filter;
118 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks the slot as unused
119 unsigned char *bytes;
// per mip level, as filled in by DPSOFTRAST_Texture_New:
// [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Queued commands are padded to a multiple of COMMAND_SIZE bytes (see
// DPSOFTRAST_ALIGNCOMMAND) and command structs are declared with
// COMMAND_ALIGN.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command header: every DEFCOMMAND struct starts with these same two
// fields, so a command can be inspected through this type.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode;
// total size of the command in bytes, as passed to DPSOFTRAST_AllocateCommand
130 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand at the tail of the pool
// when the next command does not fit there.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the opcode constant
// DPSOFTRAST_OPCODE_<name> and the struct type DPSOFTRAST_Command_<name>,
// which begins with the opcode/commandsize header.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// size in bytes of the command pool (2 MiB)
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage that DPSOFTRAST_AllocateCommand carves commands out of. The
// freecommand/usedcommands bookkeeping fields used elsewhere in this file are
// declared on lines not visible in this listing.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
// raw command bytes, indexed by byte offset
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the span code.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array: [0] = x slope, [1] = y slope, [2] = base value, each a
// 4-float vector (this is how DPSOFTRAST_CALCATTRIB reads them)
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// DPSOFTRAST_CALCATTRIB / DPSOFTRAST_CALCATTRIB4F evaluate attribute array
// `arrayindex` of `triangle` at the start of `span`:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// The first form works on __m128 values (aligned loads), the second on plain
// float[4] arrays.
// NOTE(review): the 4F form expands (span->x)/(span->y) rather than
// (span)->x/(span)->y, so `span` must be a simple pointer expression.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
170 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
171 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
172 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
173 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
174 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
175 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
176 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
177 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
178 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a triangle; threads keep an
// array of DPSOFTRAST_DRAW_MAXSPANS of these.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
192 DPSOFTRAST_State_Span);
// capacities of the per-thread spans[] and triangles[] arrays
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bit flags kept in DPSOFTRAST_State_Thread::validate; a set bit means the
// matching group of derived values must be recomputed by DPSOFTRAST_Validate.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups at once
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc selects from the current
// source/destination blend factors (and the blendsubtract flag).
202 typedef enum DPSOFTRAST_BLENDMODE_e
204 DPSOFTRAST_BLENDMODE_OPAQUE,
205 DPSOFTRAST_BLENDMODE_ALPHA,
206 DPSOFTRAST_BLENDMODE_ADDALPHA,
207 DPSOFTRAST_BLENDMODE_ADD,
208 DPSOFTRAST_BLENDMODE_INVMOD,
209 DPSOFTRAST_BLENDMODE_MUL,
210 DPSOFTRAST_BLENDMODE_MUL2,
211 DPSOFTRAST_BLENDMODE_SUBALPHA,
212 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
213 DPSOFTRAST_BLENDMODE_INVADD,
// number of modes above
214 DPSOFTRAST_BLENDMODE_TOTAL
216 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state, updated by the DPSOFTRAST_Interpret_*
// functions as the thread consumes commands. Many fields are declared on
// lines not visible in this listing.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// set by the PolygonOffset command: [0] = alongnormal, [1] = intoview
237 float polygonoffset[2];
// clip plane after DPSOFTRAST_RecalcClipPlane applied the viewport transform
239 ALIGN(float fb_clipplane[4]);
242 int shader_permutation;
243 int shader_exactspecularmath;
// pointers into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow
245 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
247 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
248 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
250 // DPSOFTRAST_VALIDATE_ flags
253 // derived values (DPSOFTRAST_VALIDATE_FB)
256 ALIGN(float fb_viewportcenter[4]);
257 ALIGN(float fb_viewportscale[4]);
259 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
262 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// byte offset into the command pool; compared against dpsoftrast.drawcommand
// and the pool's freecommand by DPSOFTRAST_Draw_FreeCommandPool
271 ATOMIC(volatile int commandoffset);
// set by DPSOFTRAST_Draw_FreeCommandPool while it waits on this thread
273 volatile bool waiting;
// when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before waiting
274 volatile bool starving;
281 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
282 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
284 DPSOFTRAST_State_Thread);
// Global rasterizer state, written by the public DPSOFTRAST_* entry points.
// Many fields are declared on lines not visible in this listing.
286 typedef ATOMIC(struct DPSOFTRAST_State_s
// framebuffer storage: one depth buffer and up to four color buffers,
// addressed as y * fb_width + x by the clear functions
290 unsigned int *fb_depthpixels;
291 unsigned int *fb_colorpixels[4];
// kept up to date by DPSOFTRAST_Viewport via DPSOFTRAST_RecalcViewport
294 ALIGN(float fb_viewportcenter[4]);
295 ALIGN(float fb_viewportscale[4]);
298 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
299 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
301 const float *pointer_vertex3f;
302 const float *pointer_color4f;
303 const unsigned char *pointer_color4ub;
304 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
307 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
308 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the texture array below; rebased by DPSOFTRAST_Texture_Grow
309 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
313 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
314 float *screencoord4f;
320 int shader_permutation;
321 int shader_exactspecularmath;
// lowest index at which DPSOFTRAST_Texture_New starts searching for a free slot
325 int texture_firstfree;
// growable texture slot array (see DPSOFTRAST_Texture_Grow)
326 DPSOFTRAST_Texture *texture;
// static message describing the most recent error reported by this file
331 const char *errorstring;
// numthreads entries
336 DPSOFTRAST_State_Thread *threads;
// command pool offset published by DPSOFTRAST_Draw_SyncCommands
338 ATOMIC(volatile int drawcommand);
340 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
344 DPSOFTRAST_State dpsoftrast;
// depth buffer values are fixed point: 1024 * 1048576 = 2^30 units per 1.0
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32bit pixel laid out as A<<24 | R<<16 | G<<8 | B,
// rounding each channel to the nearest integer after scaling by 255.
// Each argument is parenthesized so expression arguments (e.g. a+b) are scaled
// as a whole. The alpha shift goes through unsigned int because 255<<24 does
// not fit in a signed int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer format; note the
// inversion: d = 1 maps to 0 and d = 0 maps to DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
352 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
354 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
355 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
356 fb_viewportcenter[3] = 0.5f;
357 fb_viewportcenter[0] = 0.0f;
358 fb_viewportscale[1] = 0.5f * viewport[2];
359 fb_viewportscale[2] = -0.5f * viewport[3];
360 fb_viewportscale[3] = 0.5f;
361 fb_viewportscale[0] = 1.0f;
// Computes the framebuffer row bands [miny1,maxy1) and [miny2,maxy2) that
// `thread` is responsible for, from its index and dpsoftrast.numthreads.
364 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the height is cut into 2*numthreads slices and the thread gets
// slice `index` and slice `numthreads + index`
366 if (dpsoftrast.interlace)
368 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
371 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// non-interlaced (presumably the else branch; the keyword is not visible in
// this listing): one slice of fb_height/numthreads rows, with both bands
// set to the same range
375 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
376 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
380 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
382 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
383 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
384 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
385 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
386 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes everything guarded by DPSOFTRAST_VALIDATE_FB for `thread`: the
// scissor rectangle in framebuffer coordinates, the viewport transform, the
// transformed clip plane, and the thread's row bands.
389 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
391 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
392 // and viewport projection values
// scissor[] is x, y, width, height; y is flipped against fb_height
395 x1 = thread->scissor[0];
396 x2 = thread->scissor[0] + thread->scissor[2];
397 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
398 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off, the rectangle is the whole framebuffer
399 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
401 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
403 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as x, y, width, height
404 thread->fb_scissor[0] = x1;
405 thread->fb_scissor[1] = y1;
406 thread->fb_scissor[2] = x2 - x1;
407 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport transform, so it is rebuilt after it
409 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
410 DPSOFTRAST_RecalcClipPlane(thread);
411 DPSOFTRAST_RecalcThread(thread);
414 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
416 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the thread's (source factor, destination factor) pair to one of the
// DPSOFTRAST_BLENDMODE_* values and stores it in thread->fb_blendmode.
// Unrecognized pairs fall back to DPSOFTRAST_BLENDMODE_OPAQUE.
419 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending only recognizes GL_SRC_ALPHA, GL_ONE
421 if (thread->blendsubtract)
// both factors are packed into one integer so a single switch can match the pair
423 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
425 #define BLENDFUNC(sfactor, dfactor, blendmode) \
426 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
427 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
428 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending
433 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
435 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
436 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
437 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
438 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
439 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs share the MUL mode
440 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
441 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
442 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
443 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
444 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
445 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Inline pre-check for DPSOFTRAST_Validate: only makes the call when at least
// one of the flags in f is pending on the thread. Always evaluates to 0.
// Both arguments are parenthesized so pointer expressions such as `threads + i`
// work; note that `thread` and `f` are each evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state groups that are both requested in `mask` and
// pending in thread->validate, clearing each pending bit before the
// recalculation runs.
452 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested groups that are actually stale
454 mask &= thread->validate;
457 if (mask & DPSOFTRAST_VALIDATE_FB)
459 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
460 DPSOFTRAST_RecalcFB(thread);
462 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
464 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
465 DPSOFTRAST_RecalcDepthFunc(thread);
467 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
469 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
470 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by number. The slot is returned only when index is
// in [1, texture_end) and the slot has pixel storage allocated; callers in
// this file treat a false result as "no such texture" (the failure return
// itself is on a line not visible in this listing).
474 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
476 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
477 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and fixes up every pointer into it.
481 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound-texture pointers can be rebased after the array moves
483 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
484 DPSOFTRAST_State_Thread *thread;
488 // expand texture array as needed
// first growth goes to 1024 slots, later growths double the capacity
// (presumably an if/else; the else keyword is not visible in this listing)
489 if (dpsoftrast.texture_max < 1024)
490 dpsoftrast.texture_max = 1024;
492 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored straight into
// dpsoftrast.texture; on failure the old array is lost and the code below
// would operate on NULL.
493 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global texture bindings: same slot index, new base address
494 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
495 if (dpsoftrast.texbound[i])
496 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// and the same for each thread's bindings
497 for (j = 0; j < dpsoftrast.numthreads; j++)
499 thread = &dpsoftrast.threads[j];
500 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501 if (thread->texbound[i])
502 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates the request, claims a free slot in
// dpsoftrast.texture (growing the array when needed), fills in the mip level
// table and allocates zero-filled pixel storage.
// flags carry both the DPSOFTRAST_TEXTURE_FLAG_* bits and the texture format.
// On a rejected request dpsoftrast.errorstring is set to a description.
506 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six sides, everything else one
515 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
516 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
517 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is evaluated before the MAXSIZE check below, so
// very large dimensions could overflow int here - confirm callers bound them.
518 if (width*height*depth < 1)
520 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
523 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
530 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
531 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
532 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
534 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
535 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
537 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// (the condition guarding this second rejection is not visible in this listing)
542 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
545 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
547 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are not supported
552 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
554 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
557 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
559 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the previous check verbatim
562 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
564 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
567 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero exactly when x has more than one bit set
572 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
574 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
577 // find first empty slot in texture array
578 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
579 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot being claimed
581 dpsoftrast.texture_firstfree = texnum + 1;
582 if (dpsoftrast.texture_max <= texnum)
583 DPSOFTRAST_Texture_Grow();
584 if (dpsoftrast.texture_end <= texnum)
585 dpsoftrast.texture_end = texnum + 1;
// taken after the possible Grow, which may move the array
586 texture = &dpsoftrast.texture[texnum];
587 memset(texture, 0, sizeof(*texture));
588 texture->flags = flags;
589 texture->width = width;
590 texture->height = height;
591 texture->depth = depth;
592 texture->sides = sides;
// mip table: 4 bytes per pixel; [0] is the running byte offset of the level
604 s = w * h * d * sides * 4;
605 texture->mipmap[mipmaps][0] = size;
606 texture->mipmap[mipmaps][1] = s;
607 texture->mipmap[mipmaps][2] = w;
608 texture->mipmap[mipmaps][3] = h;
609 texture->mipmap[mipmaps][4] = d;
// condition on the 1x1x1 level or when mipmapping was not requested
612 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
618 texture->mipmaps = mipmaps;
619 texture->size = size;
621 // allocate the pixels now
622 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases texture `index`: frees its pixel storage, clears the slot, and
// updates the free/used bookkeeping. Does nothing for an unknown index.
626 void DPSOFTRAST_Texture_Free(int index)
628 DPSOFTRAST_Texture *texture;
629 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
633 MM_FREE(texture->bytes);
634 texture->bytes = NULL;
// a zeroed slot (bytes == NULL) is what marks it as free
635 memset(texture, 0, sizeof(*texture));
636 // adjust the free range and used range
637 if (dpsoftrast.texture_firstfree > index)
638 dpsoftrast.texture_firstfree = index;
// trim trailing free slots off the used range
639 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
640 dpsoftrast.texture_end--;
// Rebuilds mip levels 1..mipmaps-1 of texture `index`, each from the level
// before it, by averaging neighbouring 4-byte pixels with rounding.
// Does nothing for an unknown index.
642 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
644 int i, x, y, z, w, layer0, layer1, row0, row1;
// o = output pixel; i0..i3 = input rows (row0/row1 of layer0, then of layer1)
645 unsigned char *o, *i0, *i1, *i2, *i3;
646 DPSOFTRAST_Texture *texture;
647 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
648 if (texture->mipmaps <= 1)
650 for (i = 1;i < texture->mipmaps;i++)
652 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's depth
656 if (layer1 >= texture->mipmap[i-1][4])
657 layer1 = texture->mipmap[i-1][4]-1;
658 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's height
662 if (row1 >= texture->mipmap[i-1][3])
663 row1 = texture->mipmap[i-1][3]-1;
// row start addresses: level byte offset + 4 * ((height * layer + row) * width)
664 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
665 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
666 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
667 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
668 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
669 w = texture->mipmap[i][2];
672 if (texture->mipmap[i-1][2] > 1)
674 // average 3D texture
// 8 source pixels per output pixel, +4 rounds before the divide by 8
675 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
677 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
678 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
679 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
680 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
685 // average 3D mipmap with parent width == 1
// 4 source pixels (one per input row), +2 rounds before the divide by 4
686 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
688 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
689 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
690 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
691 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
697 if (texture->mipmap[i-1][2] > 1)
699 // average 2D texture (common case)
// 2x2 box filter
700 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
702 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
703 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
704 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
705 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
710 // 2D texture with parent width == 1
// only the two rows are averaged
711 o[0] = (i0[0] + i1[0] + 1) >> 1;
712 o[1] = (i0[1] + i1[1] + 1) >> 1;
713 o[2] = (i0[2] + i1[2] + 1) >> 1;
714 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels from `pixels`
// (tightly packed rows) into texture `index` at (blockx, blocky), then
// rebuilds the mipmaps. Does nothing for an unknown index.
// NOTE(review): in the visible lines the destination is always computed from
// mip level 0 and `mip` is not referenced - confirm whether that is intended.
721 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
723 DPSOFTRAST_Texture *texture;
725 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel; mipmap[0][2] is the level 0 width
730 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
731 while (blockheight > 0)
733 memcpy(dst, pixels, blockwidth * 4);
// source advances by one block row, destination by one full texture row
734 pixels += blockwidth * 4;
735 dst += texture->mipmap[0][2] * 4;
739 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of mip level 0 of texture `index` with `pixels` and rebuilds
// the mipmaps. `pixels` must provide mipmap[0][1] bytes (the level 0 byte
// size). Does nothing for an unknown index.
741 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
743 DPSOFTRAST_Texture *texture;
744 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
748 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
749 DPSOFTRAST_Texture_CalculateMipmaps(index);
751 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
753 DPSOFTRAST_Texture *texture;
754 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755 return texture->mipmap[mip][2];
757 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
759 DPSOFTRAST_Texture *texture;
760 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761 return texture->mipmap[mip][3];
763 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
765 DPSOFTRAST_Texture *texture;
766 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767 return texture->mipmap[mip][4];
// Returns a pointer to the first pixel of mip level `mip` of texture `index`
// (bytes + the level's byte offset), or 0 for an unknown index.
// NOTE(review): `mip` is used to index mipmap[] without a range check.
769 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
771 DPSOFTRAST_Texture *texture;
772 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
775 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture `index`. Does nothing for an unknown index.
777 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
779 DPSOFTRAST_Texture *texture;
780 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// filters ordered after DPSOFTRAST_TEXTURE_FILTER_LINEAR need a mipmapped
// texture; such a request on a texture without the MIPMAP flag is reported
// through dpsoftrast.errorstring
781 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
783 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
788 texture->filter = filter;
// forward declaration; used by DPSOFTRAST_AllocateCommand below
791 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current end of the written commands: copies the pool's
// freecommand offset into dpsoftrast.drawcommand.
793 static void DPSOFTRAST_Draw_SyncCommands(void)
// when threads are in use, issue the barrier before the new offset is stored
795 if(dpsoftrast.usethreads) MEMORY_BARRIER;
796 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least `space` bytes available in the command pool by
// recomputing how much of it threads still have to consume, waiting on a
// thread when that is not enough. Several control-flow lines of this
// function are not visible in this listing.
799 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
801 DPSOFTRAST_State_Thread *thread;
803 int freecommand = dpsoftrast.commandpool.freecommand;
804 int usedcommands = dpsoftrast.commandpool.usedcommands;
// checks whether the recorded usage already leaves `space` bytes free
805 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// make everything written so far visible to the threads
807 DPSOFTRAST_Draw_SyncCommands();
// per thread: distance from its read offset to the write offset, with
// wrap-around at the pool size
813 for (i = 0; i < dpsoftrast.numthreads; i++)
815 thread = &dpsoftrast.threads[i];
816 commandoffset = freecommand - thread->commandoffset;
817 if (commandoffset < 0)
818 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
819 if (commandoffset > usedcommands)
822 usedcommands = commandoffset;
825 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the thread chosen as waitindex
827 thread = &dpsoftrast.threads[waitindex];
828 Thread_LockMutex(thread->drawmutex);
// only wait if the thread has not yet reached the published offset
829 if (thread->commandoffset != dpsoftrast.drawcommand)
831 thread->waiting = true;
// signal drawcond first if the thread is flagged as starving
832 if (thread->starving) Thread_CondSignal(thread->drawcond);
833 Thread_CondWait(thread->waitcond, thread->drawmutex);
834 thread->waiting = false;
836 Thread_UnlockMutex(thread->drawmutex);
838 dpsoftrast.commandpool.usedcommands = usedcommands;
// DPSOFTRAST_ALIGNCOMMAND(size): size plus the padding needed to reach the
// next multiple of COMMAND_SIZE (the mask arithmetic relies on COMMAND_SIZE
// being a power of two).
841 #define DPSOFTRAST_ALIGNCOMMAND(size) \
842 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// DPSOFTRAST_ALLOCATECOMMAND(name): calls DPSOFTRAST_AllocateCommand with
// DPSOFTRAST_OPCODE_<name> and the padded size of DPSOFTRAST_Command_<name>,
// casting the result to that command type.
843 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
844 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool for a command with the given
// opcode and fills in its header (opcode, commandsize). Several lines of
// this function, including the return, are not visible in this listing.
846 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
848 DPSOFTRAST_Command *command;
849 int freecommand = dpsoftrast.commandpool.freecommand;
850 int usedcommands = dpsoftrast.commandpool.usedcommands;
// `extra` is added to the space requirement: one command header, plus the
// unusable tail of the pool when the command does not fit before the end
851 int extra = sizeof(DPSOFTRAST_Command);
852 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
853 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim some (threaded) or flush
854 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
856 if (dpsoftrast.usethreads)
857 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
859 DPSOFTRAST_Draw_FlushThreads();
// the calls above may have changed the pool bookkeeping, so reload it
860 freecommand = dpsoftrast.commandpool.freecommand;
861 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit in the tail: write a Reset opcode there and count the
// tail as used
863 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
865 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
866 command->opcode = DPSOFTRAST_OPCODE_Reset;
867 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
870 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
871 command->opcode = opcode;
872 command->commandsize = size;
874 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
876 dpsoftrast.commandpool.freecommand = freecommand;
877 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Adjusts the pool bookkeeping for `size` bytes: usedcommands is reduced by
// size and freecommand is updated, with a wrap-around correction by the pool
// size. The lines that change freecommand before the correction are not
// visible in this listing.
881 static void DPSOFTRAST_UndoCommand(int size)
883 int freecommand = dpsoftrast.commandpool.freecommand;
884 int usedcommands = dpsoftrast.commandpool.usedcommands;
887 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
888 usedcommands -= size;
889 dpsoftrast.commandpool.freecommand = freecommand;
890 dpsoftrast.commandpool.usedcommands = usedcommands;
893 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
894 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
896 thread->viewport[0] = command->x;
897 thread->viewport[1] = command->y;
898 thread->viewport[2] = command->width;
899 thread->viewport[3] = command->height;
900 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: sets the viewport rectangle. Queues a Viewport command
// and also updates the copy in the global state.
902 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
904 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
907 command->width = width;
908 command->height = height;
// the global state keeps its own viewport and derived transform
910 dpsoftrast.viewport[0] = x;
911 dpsoftrast.viewport[1] = y;
912 dpsoftrast.viewport[2] = width;
913 dpsoftrast.viewport[3] = height;
914 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
917 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Applies a ClearColor command: walks the part of the scissor rectangle that
// lies in this thread's row bands, for every color buffer that exists. The
// innermost statements are not visible in this listing.
918 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
920 int i, x1, y1, x2, y2, w, h, x, y;
921 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row bands must be current before they are read
925 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
926 miny1 = thread->miny1;
927 maxy1 = thread->maxy1;
928 miny2 = thread->miny2;
929 maxy2 = thread->maxy2;
// scissor rectangle as [x1,x2) x [y1,y2)
930 x1 = thread->fb_scissor[0];
931 y1 = thread->fb_scissor[1];
932 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
933 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to the span covered by this thread's two bands
934 if (y1 < miny1) y1 = miny1;
935 if (y2 > maxy2) y2 = maxy2;
940 // FIXME: honor fb_colormask?
// clear value packed once into the framebuffer pixel format
941 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
942 for (i = 0;i < 4;i++)
944 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to the end of band 1, the next pass jumps to the
// start of band 2 and covers rows up to the end of band 2
946 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in color buffer i
949 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
950 for (x = x1;x < x2;x++)
// Public entry point: queues a ClearColor command for the given color
// (the lines that fill in the command fields are not visible in this listing).
955 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
957 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
964 DEFCOMMAND(3, ClearDepth, float depth;)
// Applies a ClearDepth command: walks the part of the scissor rectangle that
// lies in this thread's row bands of the depth buffer. The innermost
// statements are not visible in this listing.
965 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
967 int x1, y1, x2, y2, w, h, x, y;
968 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row bands must be current before they are read
972 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
973 miny1 = thread->miny1;
974 maxy1 = thread->maxy1;
975 miny2 = thread->miny2;
976 maxy2 = thread->maxy2;
// scissor rectangle as [x1,x2) x [y1,y2)
977 x1 = thread->fb_scissor[0];
978 y1 = thread->fb_scissor[1];
979 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
980 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to the span covered by this thread's two bands
981 if (y1 < miny1) y1 = miny1;
982 if (y2 > maxy2) y2 = maxy2;
// clear value converted once to the integer depth format
987 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to the end of band 1, the next pass jumps to the
// start of band 2 and covers rows up to the end of band 2
988 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in the depth buffer
991 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
992 for (x = x1;x < x2;x++)
// Public entry point: queues a ClearDepth command for depth value d
// (the line that fills in the command field is not visible in this listing).
996 void DPSOFTRAST_ClearDepth(float d)
998 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1002 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1003 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1005 thread->colormask[0] = command->r != 0;
1006 thread->colormask[1] = command->g != 0;
1007 thread->colormask[2] = command->b != 0;
1008 thread->colormask[3] = command->a != 0;
1009 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public entry point: queues a ColorMask command with the four channel
// enables (the lines that fill in the command fields are not visible in this
// listing).
1011 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1013 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1020 DEFCOMMAND(5, DepthTest, int enable;)
1021 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1023 thread->depthtest = command->enable;
1024 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1026 void DPSOFTRAST_DepthTest(int enable)
1028 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1029 command->enable = enable;
1032 DEFCOMMAND(6, ScissorTest, int enable;)
1033 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1035 thread->scissortest = command->enable;
1036 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1038 void DPSOFTRAST_ScissorTest(int enable)
1040 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1041 command->enable = enable;
1044 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1045 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1047 thread->scissor[0] = command->x;
1048 thread->scissor[1] = command->y;
1049 thread->scissor[2] = command->width;
1050 thread->scissor[3] = command->height;
1051 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Scissor command for the given rectangle
// (the lines that store x and y are not visible in this listing).
1053 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1055 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1058 command->width = width;
1059 command->height = height;
1062 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1063 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1065 thread->blendfunc[0] = command->sfactor;
1066 thread->blendfunc[1] = command->dfactor;
1067 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1069 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1071 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1072 command->sfactor = sfactor;
1073 command->dfactor = dfactor;
1076 DEFCOMMAND(9, BlendSubtract, int enable;)
1077 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1079 thread->blendsubtract = command->enable;
1080 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1082 void DPSOFTRAST_BlendSubtract(int enable)
1084 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1085 command->enable = enable;
1088 DEFCOMMAND(10, DepthMask, int enable;)
1089 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1091 thread->depthmask = command->enable;
1093 void DPSOFTRAST_DepthMask(int enable)
1095 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1096 command->enable = enable;
1099 DEFCOMMAND(11, DepthFunc, int func;)
1100 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1102 thread->depthfunc = command->func;
1104 void DPSOFTRAST_DepthFunc(int func)
1106 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1107 command->func = func;
1110 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1111 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1113 thread->depthrange[0] = command->nearval;
1114 thread->depthrange[1] = command->farval;
1116 void DPSOFTRAST_DepthRange(float nearval, float farval)
1118 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1119 command->nearval = nearval;
1120 command->farval = farval;
1123 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1124 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1126 thread->polygonoffset[0] = command->alongnormal;
1127 thread->polygonoffset[1] = command->intoview;
1129 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1131 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1132 command->alongnormal = alongnormal;
1133 command->intoview = intoview;
1136 DEFCOMMAND(14, CullFace, int mode;)
1137 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1139 thread->cullface = command->mode;
1141 void DPSOFTRAST_CullFace(int mode)
1143 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1144 command->mode = mode;
1147 DEFCOMMAND(15, AlphaTest, int enable;)
1148 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1150 thread->alphatest = command->enable;
1152 void DPSOFTRAST_AlphaTest(int enable)
1154 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1155 command->enable = enable;
1158 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1159 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1161 thread->alphafunc = command->func;
1162 thread->alphavalue = command->ref;
1164 void DPSOFTRAST_AlphaFunc(int func, float ref)
1166 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1167 command->func = func;
1171 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1173 dpsoftrast.color[0] = r;
1174 dpsoftrast.color[1] = g;
1175 dpsoftrast.color[2] = b;
1176 dpsoftrast.color[3] = a;
// Copies a rectangle of the color framebuffer (dpsoftrast.fb_colorpixels[0])
// into outpixels, 4 bytes per pixel, with output rows blockwidth*4 bytes apart.
// The requested rectangle is clamped to the framebuffer bounds, and source
// rows are addressed as (fb_height - 1 - y), i.e. the image is flipped
// vertically relative to framebuffer row order.
// NOTE(review): output rows are indexed from the clamped by1 and always start
// at the beginning of an output row, so a request that begins at negative
// coordinates is shifted towards the start of outpixels instead of keeping
// its unclipped position - confirm callers never rely on that.
1179 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte strides of one output row and one framebuffer row
1181 int outstride = blockwidth * 4;
1182 int instride = dpsoftrast.fb_width * 4;
// exclusive end coordinates of the requested rectangle
1185 int bx2 = blockx + blockwidth;
1186 int by2 = blocky + blockheight;
1190 unsigned char *inpixels;
// clamp the rectangle to the framebuffer
1194 if (bx1 < 0) bx1 = 0;
1195 if (by1 < 0) by1 = 0;
1196 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1197 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1199 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// bigendian hosts walk the individual pixels of each row
1200 if (dpsoftrast.bigendian)
1202 for (y = by1;y < by2;y++)
// b: start of the (vertically flipped) source row, o: start of the output row
1204 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1205 o = (unsigned char *)outpixels + (y - by1) * outstride;
1206 for (x = bx1;x < bx2;x++)
// second row loop, using the same source/output row addressing
1219 for (y = by1;y < by2;y++)
1221 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1222 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the color framebuffer (dpsoftrast.fb_colorpixels[0])
// into mip level `mip` of texture `index`.
//   tx, ty        - destination position inside the texture mip level
//   sx, sy        - source position inside the framebuffer
//   width, height - requested rectangle size
// Returns without doing anything when the texture handle cannot be resolved
// or mip is outside [0, texture->mipmaps).  Both rectangles are clamped to
// their surfaces and tw/th are capped to sw/sh before copying.  Pixels are
// copied as 4-byte units, one memcpy per row.
1228 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
// exclusive end coordinates in texture space and framebuffer space
1232 int tx2 = tx + width;
1233 int ty2 = ty + height;
1236 int sx2 = sx + width;
1237 int sy2 = sy + height;
1247 unsigned int *spixels;
1248 unsigned int *tpixels;
1249 DPSOFTRAST_Texture *texture;
1250 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1251 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the color framebuffer
1253 spixels = dpsoftrast.fb_colorpixels[0];
1254 swidth = dpsoftrast.fb_width;
1255 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is the byte offset of this level inside
// texture->bytes, [2] and [3] are used as its width and height
1256 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1257 twidth = texture->mipmap[mip][2];
1258 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1259 if (tx1 < 0) tx1 = 0;
1260 if (ty1 < 0) ty1 = 0;
1261 if (tx2 > twidth) tx2 = twidth;
1262 if (ty2 > theight) ty2 = theight;
1263 if (sx1 < 0) sx1 = 0;
1264 if (sy1 < 0) sy1 = 0;
1265 if (sx2 > swidth) sx2 = swidth;
1266 if (sy2 > sheight) sy2 = sheight;
// never copy more than the source rectangle provides
1271 if (tw > sw) tw = sw;
1272 if (th > sh) th = sh;
1273 if (tw < 1 || th < 1)
// flip the source start row; the copy then walks source rows downwards
// (sy1 - y) while destination rows go upwards (ty1 + y)
1275 sy1 = sheight - 1 - sy1;
1276 for (y = 0;y < th;y++)
1277 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// textures with more than one mip level get their mipmaps recalculated
1278 if (texture->mipmaps > 1)
1279 DPSOFTRAST_Texture_CalculateMipmaps(index);
1282 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1283 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1285 if (thread->texbound[command->unitnum])
1286 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1287 thread->texbound[command->unitnum] = command->texture;
1289 void DPSOFTRAST_SetTexture(int unitnum, int index)
1291 DPSOFTRAST_Command_SetTexture *command;
1292 DPSOFTRAST_Texture *texture;
1293 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1295 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1298 texture = DPSOFTRAST_Texture_GetByIndex(index);
1299 if (index && !texture)
1301 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1305 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1306 command->unitnum = unitnum;
1307 command->texture = texture;
1309 dpsoftrast.texbound[unitnum] = texture;
1310 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1313 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1315 dpsoftrast.pointer_vertex3f = vertex3f;
1316 dpsoftrast.stride_vertex = stride;
1318 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1320 dpsoftrast.pointer_color4f = color4f;
1321 dpsoftrast.pointer_color4ub = NULL;
1322 dpsoftrast.stride_color = stride;
1324 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1326 dpsoftrast.pointer_color4f = NULL;
1327 dpsoftrast.pointer_color4ub = color4ub;
1328 dpsoftrast.stride_color = stride;
1330 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1332 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1333 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1334 dpsoftrast.stride_texcoord[unitnum] = stride;
1337 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1338 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1340 thread->shader_mode = command->mode;
1341 thread->shader_permutation = command->permutation;
1342 thread->shader_exactspecularmath = command->exactspecularmath;
1344 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1346 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1347 command->mode = mode;
1348 command->permutation = permutation;
1349 command->exactspecularmath = exactspecularmath;
1351 dpsoftrast.shader_mode = mode;
1352 dpsoftrast.shader_permutation = permutation;
1353 dpsoftrast.shader_exactspecularmath = exactspecularmath;
1356 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1357 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1359 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1361 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1363 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1364 command->index = index;
1365 command->val[0] = v0;
1366 command->val[1] = v1;
1367 command->val[2] = v2;
1368 command->val[3] = v3;
1370 dpsoftrast.uniform4f[index*4+0] = v0;
1371 dpsoftrast.uniform4f[index*4+1] = v1;
1372 dpsoftrast.uniform4f[index*4+2] = v2;
1373 dpsoftrast.uniform4f[index*4+3] = v3;
1375 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1377 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1378 command->index = index;
1379 memcpy(command->val, v, sizeof(command->val));
1381 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1384 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1385 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1387 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` consecutive 4x4 float matrices read from v, starting at
// uniform slot `uniform`.  Each loop iteration allocates one UniformMatrix4f
// command, then advances the uniform index by 4 and v by 16 floats.
// The four resulting vectors are written both into the command (command->val)
// and into the global copy dpsoftrast.uniform4f[index*4 ...].
// NOTE(review): the unpack/movelh/movehl sequence on t0..t3 is a 4x4
// transpose of m0..m3; it presumably runs only when `transpose` is non-zero -
// confirm against the surrounding condition.
// The _mm_store_ps calls need 16-byte aligned destinations; command->val is
// declared with ALIGN, the alignment of dpsoftrast.uniform4f is assumed -
// TODO confirm.
1389 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1393 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1395 __m128 m0, m1, m2, m3;
1396 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1397 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned loads when v is not a multiple of ALIGN_SIZE, aligned loads otherwise
1398 if (((size_t)v)&(ALIGN_SIZE-1))
1400 m0 = _mm_loadu_ps(v);
1401 m1 = _mm_loadu_ps(v+4);
1402 m2 = _mm_loadu_ps(v+8);
1403 m3 = _mm_loadu_ps(v+12);
1407 m0 = _mm_load_ps(v);
1408 m1 = _mm_load_ps(v+4);
1409 m2 = _mm_load_ps(v+8);
1410 m3 = _mm_load_ps(v+12);
// transpose: interleave pairs of rows, then recombine the low/high halves
1414 __m128 t0, t1, t2, t3;
1415 t0 = _mm_unpacklo_ps(m0, m1);
1416 t1 = _mm_unpacklo_ps(m2, m3);
1417 t2 = _mm_unpackhi_ps(m0, m1);
1418 t3 = _mm_unpackhi_ps(m2, m3);
1419 m0 = _mm_movelh_ps(t0, t1);
1420 m1 = _mm_movehl_ps(t1, t0);
1421 m2 = _mm_movelh_ps(t2, t3);
1422 m3 = _mm_movehl_ps(t3, t2);
// store into the command
1424 _mm_store_ps(command->val, m0);
1425 _mm_store_ps(command->val+4, m1);
1426 _mm_store_ps(command->val+8, m2);
1427 _mm_store_ps(command->val+12, m3);
// store into the global copy
1428 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1429 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1430 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1431 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1436 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1437 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1439 thread->uniform1i[command->index] = command->val;
1441 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1443 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1444 command->index = index;
1447 dpsoftrast.uniform1i[command->index] = i0;
1450 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1451 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1453 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1454 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1456 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1458 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1459 command->clipplane[0] = x;
1460 command->clipplane[1] = y;
1461 command->clipplane[2] = z;
1462 command->clipplane[3] = w;
// Copies 4-float vectors from a strided byte source into dst; `end` marks the
// position `size` vectors past dst.
// Unaligned loads are used when src or stride is not a multiple of
// ALIGN_SIZE, aligned loads otherwise.  dst is always written with
// _mm_store_ps and therefore has to be 16-byte aligned.
1466 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1468 float *end = dst + size*4;
1469 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source path
1473 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// aligned source path
1482 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float vectors from a strided byte source into 4-float vectors in
// dst, setting the fourth component to 1.0f.  `end` marks the position `size`
// output vectors past dst.
// Tightly packed input (stride == sizeof(float[3])) has a batch path up to
// end4 = dst + (size&~3)*4: three 16-byte loads hold four xyz triples, which
// are re-split by shuffles into four (x, y, z, 1) vectors, and src advances
// by four triples per batch.
// The per-vertex paths load 16 bytes at src, rotate so that the fourth loaded
// float sits in lane 0, overwrite that lane with 1.0f and rotate back.
// NOTE(review): the per-vertex paths read one float beyond each 3-float
// vertex; for the last vertex that is past the end of the data unless the
// caller's buffer has padding - confirm.
// dst is written with _mm_store_ps and has to be 16-byte aligned.
1489 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1491 float *end = dst + size*4;
1492 if (stride == sizeof(float[3]))
1494 float *end4 = dst + (size&~3)*4;
1495 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned batch: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1499 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1500 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1501 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1503 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1504 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1505 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1506 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1507 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1508 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1509 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1510 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1511 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1513 src += 4*sizeof(float[3]);
// aligned batch: same shuffles, aligned loads
1520 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1521 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1522 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1525 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1526 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1527 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1528 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1529 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1530 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1532 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534 src += 4*sizeof(float[3]);
// per-vertex paths: unaligned loads if src or stride is not ALIGN_SIZE-aligned
1538 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1542 __m128 v = _mm_loadu_ps((const float *)src);
1543 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1544 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1545 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1546 _mm_store_ps(dst, v);
1555 __m128 v = _mm_load_ps((const float *)src);
1556 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1557 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1558 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1559 _mm_store_ps(dst, v);
// Expands 2-float vectors from a strided byte source into 4-float vectors in
// dst, filling the third and fourth components from v2 = (0, 0, 0, 1), i.e.
// each output is (s, t, 0, 1).  `end` marks the position `size` output
// vectors past dst.
// Tightly packed input (stride == sizeof(float[2])) has a batch path up to
// end2 = dst + (size&~1)*4: one 16-byte load holds two pairs, the first
// output takes the low pair and the second output the high pair, and src
// advances by two pairs per batch.
// The per-vertex path loads just the 8 bytes of one pair into the low half
// of v2.  dst is written with _mm_store_ps and has to be 16-byte aligned.
1566 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1568 float *end = dst + size*4;
1569 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1570 if (stride == sizeof(float[2]))
1572 float *end2 = dst + (size&~1)*4;
1573 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned batch of two pairs
1577 __m128 v = _mm_loadu_ps((const float *)src);
1578 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1579 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1581 src += 2*sizeof(float[2]);
// aligned batch of two pairs
1588 __m128 v = _mm_load_ps((const float *)src);
1589 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1590 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1592 src += 2*sizeof(float[2]);
// single pair: low half from src, high half (0, 1) from v2
1598 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte vectors from a strided byte source into 4-float vectors in
// dst, scaling each byte by 1/255.  `end` marks the position `size` output
// vectors past dst.
// Tightly packed input (stride == sizeof(unsigned char[4])) has a batch path
// up to end4 = dst + (size&~3)*4: one 16-byte load holds four vectors; the
// bytes are zero-extended to 16 and then 32 bits, converted to float and
// scaled, and src advances by four vectors per batch.
// The per-vertex path reads the four bytes of one vector through an int
// load and widens them the same way.
// dst is written with _mm_store_ps and has to be 16-byte aligned.
1604 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1606 float *end = dst + size*4;
1607 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1608 if (stride == sizeof(unsigned char[4]))
1610 float *end4 = dst + (size&~3)*4;
1611 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned batch: v1 = vectors 0-1 as 16-bit lanes, v2 = vectors 2-3
1615 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1616 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1617 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1618 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1619 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1621 src += 4*sizeof(unsigned char[4]);
// aligned batch: same widening, aligned load
1628 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1629 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1630 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1631 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1632 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1634 src += 4*sizeof(unsigned char[4]);
// single vector: four bytes fetched as one int
1640 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1641 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of one 4-float vector: src is loaded once (unaligned
// load) and stored to dst; `end` marks the position `size` vectors past dst.
// dst is written with _mm_store_ps and has to be 16-byte aligned.
1647 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1649 float *end = dst + 4*size;
1650 __m128 v = _mm_loadu_ps(src);
1653 _mm_store_ps(dst, v);
// Transforms 4-float vectors from in4f by the 4x4 matrix inmatrix16f;
// `end` marks the position numitems vectors past out4f.
// With m0..m3 being the four consecutive 4-float groups of the matrix, each
// result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
// A matrix that is bitwise equal to the identity takes a fast path that only
// copies the input (skipped entirely when out4f == in4f).
// The matrix is always read with unaligned loads; the input uses unaligned
// loads when in4f is not ALIGN_SIZE-aligned and aligned loads otherwise.
1659 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1662 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1663 __m128 m0, m1, m2, m3;
1665 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1667 // fast case for identity matrix
1668 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1671 end = out4f + numitems*4;
1672 m0 = _mm_loadu_ps(inmatrix16f);
1673 m1 = _mm_loadu_ps(inmatrix16f + 4);
1674 m2 = _mm_loadu_ps(inmatrix16f + 8);
1675 m3 = _mm_loadu_ps(inmatrix16f + 12);
1676 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: broadcast each component of v and accumulate against m0..m3
1680 __m128 v = _mm_loadu_ps(in4f);
1682 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1683 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1684 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1685 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input: same arithmetic with an aligned load
1694 __m128 v = _mm_load_ps(in4f);
1696 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1697 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1698 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1699 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f.
//   out4f    - destination, room for numitems*4 floats
//   in4f     - source, numitems*4 floats
//   numitems - number of vectors; zero or negative copies nothing
// Calling it in place (out4f == in4f) is a no-op, matching the identity fast
// path of DPSOFTRAST_Vertex_Transform.  Partially overlapping buffers are not
// supported.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// a negative count would otherwise convert to an enormous size_t
	if (numitems <= 0)
		return;
	// memcpy must not be handed identical source and destination ranges
	if (out4f == in4f)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// Perspective projection of one 4-float vertex `in` into `out`.
// The vertex is rotated to (w, x, y, z), lane 0 is replaced by 1.0f, and the
// whole vector is multiplied by viewportscale, divided by the original w and
// offset by viewportcenter before being rotated back.  As a result
// viewportscale/viewportcenter elements 1..3 apply to x, y, z, element 0
// applies to the 1/w term, and that 1/w term ends up in the fourth lane of out.
// `in` is evaluated once; the macro declares locals named p and w.
1713 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1715 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1716 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1717 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1718 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same operation sequence as DPSOFTRAST_PROJECTVERTEX: rotates `in` to
// (w, x, y, z), replaces lane 0 by 1.0f, scales by viewportscale, divides by
// the original w, adds viewportcenter and rotates back into `out`.
// `in` is evaluated once; the macro declares locals named p and w.
1721 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1723 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1724 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1725 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1726 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Matrix transform of one 4-float vertex: out = p.x*m0 + p.y*m1 + p.z*m2 +
// p.w*m3, where each component of p is broadcast across all four lanes before
// the multiply.  p is a macro-local vector, presumably initialised from `in`
// - TODO confirm.
1729 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1732 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1733 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1734 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1735 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen row range (*starty, *endy) covered by the axis-aligned
// box spanned by minposf/maxposf once its corners are transformed by
// inmatrix16f.
//   minposf, maxposf - box corners, read with aligned loads (16-byte aligned)
//   inmatrix16f      - 4x4 matrix, read with unaligned loads
// clipmask starts at 0xFF with one bit per box corner; a corner's bit is
// cleared when its clip distance (z + w of the transformed corner) is >= 0.
// NOTE(review): the return statement is outside the visible lines; the
// result presumably is clipmask - confirm.
//
// Each matrix group has its lanes 0 and 1 swapped so that the transformed y
// lands in lane 0, where the scalar (_ss) min/max/div operations work.
// BBFRONT(k, pos): transforms corner k and, if it is not clipped, folds its
// projected y (y / w) into minproj/maxproj.
// BBCLIP(k): for a clipped corner k, looks at the three corners that differ
// from it in one bit (k^1, k^2, k^4); for each such neighbour that is not
// clipped, it interpolates along the edge to the point where the clip
// distance reaches zero, projects that point and folds it into
// minproj/maxproj.
1738 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1740 int clipmask = 0xFF;
1741 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1742 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1743 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1744 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1745 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1746 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1747 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1748 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1749 #define BBFRONT(k, pos) \
1751 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1752 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1753 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1756 clipmask &= ~(1<<k); \
1757 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1758 minproj = _mm_min_ss(minproj, proj); \
1759 maxproj = _mm_max_ss(maxproj, proj); \
1763 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1764 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1765 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1766 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1767 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1768 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1772 if (clipmask&(1<<k)) \
1774 if (!(clipmask&(1<<(k^1)))) \
1776 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1777 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1778 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1779 minproj = _mm_min_ss(minproj, proj); \
1780 maxproj = _mm_max_ss(maxproj, proj); \
1782 if (!(clipmask&(1<<(k^2)))) \
1784 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1785 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1786 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1787 minproj = _mm_min_ss(minproj, proj); \
1788 maxproj = _mm_max_ss(maxproj, proj); \
1790 if (!(clipmask&(1<<(k^4)))) \
1792 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1793 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1794 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1795 minproj = _mm_min_ss(minproj, proj); \
1796 maxproj = _mm_max_ss(maxproj, proj); \
1800 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring element [2] of the viewport vectors (the one DPSOFTRAST_PROJECTVERTEX
// applies to y) into lane 0 for the scalar math below
1801 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1802 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// limit the projected range to [-2, 2] before converting to rows
1803 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1804 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1805 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1806 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// NOTE(review): the start row comes from maxproj and the end row from minproj,
// which fits a negative y viewport scale - confirm.  Both use truncation.
1807 *starty = _mm_cvttss_si32(maxproj);
1808 *endy = _mm_cvttss_si32(minproj)+1;
// Projects already-transformed vertices to screen space.
// For each input vector v: v is stored unchanged to out4f, its projection
// (DPSOFTRAST_PROJECTVERTEX with the framebuffer viewport center/scale) is
// stored to screen4f, and a component-wise min/max over all inputs is kept.
// The min/max box is then passed to DPSOFTRAST_Vertex_BoundY with an identity
// matrix, which fills *starty/*endy; its result is this function's return
// value.
// in4f is read with unaligned or aligned loads depending on its alignment;
// out4f and screen4f are written with _mm_store_ps (16-byte aligned).
// NOTE(review): minpos/maxpos are seeded from the first input vector, so
// in4f is read once even before the per-vertex work - confirm callers never
// pass numitems == 0 with an unreadable pointer.
1812 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1814 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1815 float *end = out4f + numitems*4;
1816 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1817 __m128 minpos, maxpos;
1818 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path
1820 minpos = maxpos = _mm_loadu_ps(in4f);
1823 __m128 v = _mm_loadu_ps(in4f);
1824 minpos = _mm_min_ps(minpos, v);
1825 maxpos = _mm_max_ps(maxpos, v);
1826 _mm_store_ps(out4f, v);
1827 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1828 _mm_store_ps(screen4f, v);
// aligned input path
1836 minpos = maxpos = _mm_load_ps(in4f);
1839 __m128 v = _mm_load_ps(in4f);
1840 minpos = _mm_min_ps(minpos, v);
1841 maxpos = _mm_max_ps(maxpos, v);
1842 _mm_store_ps(out4f, v);
1843 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1844 _mm_store_ps(screen4f, v);
// spill the bounds to aligned arrays for DPSOFTRAST_Vertex_BoundY
1852 ALIGN(float minposf[4]);
1853 ALIGN(float maxposf[4]);
1854 _mm_store_ps(minposf, minpos);
1855 _mm_store_ps(maxposf, maxpos);
1856 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms vertices by inmatrix16f and projects them to screen space.
// A matrix that is bitwise equal to the identity is delegated to
// DPSOFTRAST_Vertex_Project.  Otherwise, for each input vector v: the
// component-wise min/max of the untransformed inputs is updated, v is
// transformed (DPSOFTRAST_TRANSFORMVERTEX) and stored to out4f, then
// projected (DPSOFTRAST_PROJECTVERTEX) and stored to screen4f.
// The untransformed min/max box and the matrix are finally passed to
// DPSOFTRAST_Vertex_BoundY, which fills *starty/*endy; its result is this
// function's return value.
// in4f is read with unaligned or aligned loads depending on its alignment;
// out4f and screen4f are written with _mm_store_ps (16-byte aligned).
1861 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1863 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1864 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1866 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1867 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1868 end = out4f + numitems*4;
1869 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1870 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1871 m0 = _mm_loadu_ps(inmatrix16f);
1872 m1 = _mm_loadu_ps(inmatrix16f + 4);
1873 m2 = _mm_loadu_ps(inmatrix16f + 8);
1874 m3 = _mm_loadu_ps(inmatrix16f + 12);
1875 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path
1877 minpos = maxpos = _mm_loadu_ps(in4f);
1880 __m128 v = _mm_loadu_ps(in4f);
1881 minpos = _mm_min_ps(minpos, v);
1882 maxpos = _mm_max_ps(maxpos, v);
1883 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1884 _mm_store_ps(out4f, v);
1885 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1886 _mm_store_ps(screen4f, v);
// aligned input path
1894 minpos = maxpos = _mm_load_ps(in4f);
1897 __m128 v = _mm_load_ps(in4f);
1898 minpos = _mm_min_ps(minpos, v);
1899 maxpos = _mm_max_ps(maxpos, v);
1900 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1901 _mm_store_ps(out4f, v);
1902 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1903 _mm_store_ps(screen4f, v);
// spill the untransformed bounds; BoundY applies the matrix to the box corners
1911 ALIGN(float minposf[4]);
1912 ALIGN(float maxposf[4]);
1913 _mm_store_ps(minposf, minpos);
1914 _mm_store_ps(maxposf, maxpos);
1915 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads the client vertex array selected by `inarray` into
// dpsoftrast.post_array4f[outarray] as 4 floats per vertex, for the range
// starting at dpsoftrast.firstvertex and spanning dpsoftrast.numvertices
// vertices.
//   position  - 3-float source, expanded with DPSOFTRAST_Load3fTo4f
//   color     - float colors if a float pointer is set, otherwise byte colors
//               if a byte pointer is set, otherwise every vertex is filled
//               with the constant dpsoftrast.color
//   texcoords - unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0); when a pointer is
//               set, the 2-, 3- or 4-float loader is picked from
//               components_texcoord for that unit
1921 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1924 float *outf = dpsoftrast.post_array4f[outarray];
1925 const unsigned char *inb;
1926 int firstvertex = dpsoftrast.firstvertex;
1927 int numvertices = dpsoftrast.numvertices;
1931 case DPSOFTRAST_ARRAY_POSITION:
1932 stride = dpsoftrast.stride_vertex;
// byte address of the first vertex of the draw range
1933 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1934 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1936 case DPSOFTRAST_ARRAY_COLOR:
1937 stride = dpsoftrast.stride_color;
1938 if (dpsoftrast.pointer_color4f)
1940 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1941 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1943 else if (dpsoftrast.pointer_color4ub)
// same value as already assigned at the top of this case
1945 stride = dpsoftrast.stride_color;
1946 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1947 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array bound: replicate the constant color
1951 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1955 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1956 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1958 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1959 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1962 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1965 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1968 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a vertex array in place by inmatrix16f.
// With inarray >= 0 the client array is first loaded into
// dpsoftrast.post_array4f[outarray] via DPSOFTRAST_Array_Load; with a
// negative inarray the current contents of that post array are used as is.
// dpsoftrast.numvertices vectors are transformed.
1980 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1982 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1983 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a vertex array to screen space without a matrix transform.
// With inarray >= 0 the client array is first loaded into
// dpsoftrast.post_array4f[outarray] via DPSOFTRAST_Array_Load; with a
// negative inarray the current contents of that post array are used as is.
// DPSOFTRAST_Vertex_Project writes the screen coordinates to
// dpsoftrast.screencoord4f and the row range to dpsoftrast.drawstarty /
// drawendy; its result is stored in dpsoftrast.drawclipped.
1988 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1991 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1992 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a vertex array in place by inmatrix16f and projects it to
// screen space.
// With inarray >= 0 the client array is first loaded into
// dpsoftrast.post_array4f[outarray] via DPSOFTRAST_Array_Load; with a
// negative inarray the current contents of that post array are used as is.
// DPSOFTRAST_Vertex_TransformProject writes the screen coordinates to
// dpsoftrast.screencoord4f and the row range to dpsoftrast.drawstarty /
// drawendy; its result is stored in dpsoftrast.drawclipped.
2000 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2003 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2004 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel 1/w values for one span.
// w is evaluated from the triangle's plane coefficients as
// w[2] + x*w[0] + y*w[1] at the span origin, with w[0] being the change per
// pixel along x (wslope).
// When w[0] is exactly 0 the span is handled by a plain per-pixel loop (flat
// polygons).  Otherwise the span is cut into sub-spans of at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the reciprocal is computed only at the
// sub-span ends and stepped linearly (dz) in between.
// NOTE(review): the stores into zf are outside the visible lines; zf
// presumably receives one value per pixel - confirm.
2011 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2014 int startx = span->startx;
2015 int endx = span->endx;
2016 float wslope = triangle->w[0];
2017 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// reciprocal at the first pixel of the span
2018 float endz = 1.0f / (w + wslope * startx);
2019 if (triangle->w[0] == 0)
2021 // LordHavoc: fast flat polygons (HUD/menu)
2022 for (x = startx;x < endx;x++)
2026 for (x = startx;x < endx;)
2028 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened so it ends on the final pixel of the span
2030 if (nextsub >= endx) nextsub = endsub = endx-1;
2031 endz = 1.0f / (w + wslope * nextsub);
// linear step between the sub-span ends; 0 when the sub-span is a single pixel
2032 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2033 for (; x <= endsub; x++, z += dz)
// Converts one span of float colors (in4f, four floats per pixel) to 8 bits
// per channel and blends it into the color framebuffer according to
// thread->fb_blendmode.
//
// Channel pairing: framebuffer byte 0 is paired with in4f component 2,
// byte 1 with component 1, byte 2 with component 0 and byte 3 with
// component 3 in every blend mode.
// Results are clamped to 255; only SUBALPHA also clamps at 0.
2038 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2041 int startx = span->startx;
2042 int endx = span->endx;
2045 unsigned char * RESTRICT pixelmask = span->pixelmask;
2046 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// advance to the first framebuffer byte of this span (4 bytes per pixel)
2049 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2050 // handle alphatest now (this affects depth writes too)
2051 if (thread->alphatest)
2052 for (x = startx;x < endx;x++)
2053 if (in4f[x*4+3] < 0.5f)
2054 pixelmask[x] = false;
2055 // FIXME: this does not handle bigendian
2056 switch(thread->fb_blendmode)
// OPAQUE: dst = src * 255
2058 case DPSOFTRAST_BLENDMODE_OPAQUE:
2059 for (x = startx;x < endx;x++)
2063 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2064 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2065 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2066 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2067 pixel[x*4+0] = d[0];
2068 pixel[x*4+1] = d[1];
2069 pixel[x*4+2] = d[2];
2070 pixel[x*4+3] = d[3];
// ALPHA: dst = src * (alpha * 255) + dst * (1 - alpha)
2073 case DPSOFTRAST_BLENDMODE_ALPHA:
2074 for (x = startx;x < endx;x++)
2078 a = in4f[x*4+3] * 255.0f;
2079 b = 1.0f - in4f[x*4+3];
2080 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2081 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2082 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2083 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2084 pixel[x*4+0] = d[0];
2085 pixel[x*4+1] = d[1];
2086 pixel[x*4+2] = d[2];
2087 pixel[x*4+3] = d[3];
// ADDALPHA: dst = dst + src * (alpha * 255)
2090 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2091 for (x = startx;x < endx;x++)
2095 a = in4f[x*4+3] * 255.0f;
2096 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2097 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2098 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2099 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2100 pixel[x*4+0] = d[0];
2101 pixel[x*4+1] = d[1];
2102 pixel[x*4+2] = d[2];
2103 pixel[x*4+3] = d[3];
// ADD: dst = dst + src * 255
2106 case DPSOFTRAST_BLENDMODE_ADD:
2107 for (x = startx;x < endx;x++)
2111 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2112 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2113 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2114 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2115 pixel[x*4+0] = d[0];
2116 pixel[x*4+1] = d[1];
2117 pixel[x*4+2] = d[2];
2118 pixel[x*4+3] = d[3];
// INVMOD: dst = dst * (1 - src)
2121 case DPSOFTRAST_BLENDMODE_INVMOD:
2122 for (x = startx;x < endx;x++)
2126 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2127 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2128 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2129 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2130 pixel[x*4+0] = d[0];
2131 pixel[x*4+1] = d[1];
2132 pixel[x*4+2] = d[2];
2133 pixel[x*4+3] = d[3];
// MUL: dst = dst * src
2136 case DPSOFTRAST_BLENDMODE_MUL:
2137 for (x = startx;x < endx;x++)
2141 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2142 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2143 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2144 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2145 pixel[x*4+0] = d[0];
2146 pixel[x*4+1] = d[1];
2147 pixel[x*4+2] = d[2];
2148 pixel[x*4+3] = d[3];
// MUL2: dst = dst * src * 2
2151 case DPSOFTRAST_BLENDMODE_MUL2:
2152 for (x = startx;x < endx;x++)
2156 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2157 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2158 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2159 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2160 pixel[x*4+0] = d[0];
2161 pixel[x*4+1] = d[1];
2162 pixel[x*4+2] = d[2];
2163 pixel[x*4+3] = d[3];
// SUBALPHA: dst = dst - src * (alpha * 255); the result can go negative,
// so this is the one mode that also clamps at 0
2166 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2167 for (x = startx;x < endx;x++)
2171 a = in4f[x*4+3] * -255.0f;
2172 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2173 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2174 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2175 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2176 pixel[x*4+0] = d[0];
2177 pixel[x*4+1] = d[1];
2178 pixel[x*4+2] = d[2];
2179 pixel[x*4+3] = d[3];
// PSEUDOALPHA: dst = src * a + dst * (1 - alpha)
// NOTE(review): `a` is not assigned in the visible lines of this case;
// confirm its value against the full source.
2182 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2183 for (x = startx;x < endx;x++)
2188 b = 1.0f - in4f[x*4+3];
2189 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2190 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2191 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2192 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2193 pixel[x*4+0] = d[0];
2194 pixel[x*4+1] = d[1];
2195 pixel[x*4+2] = d[2];
2196 pixel[x*4+3] = d[3];
// INVADD: dst = dst + (255 - dst) * src
// Each framebuffer byte must be combined with its own source component
// (byte 0 with in4f component 2, byte 2 with component 0), exactly as in
// the other blend modes; pairing them the other way round swaps the first
// and third channel of the result.
2199 case DPSOFTRAST_BLENDMODE_INVADD:
2200 for (x = startx;x < endx;x++)
2204 d[0] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2205 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2206 d[2] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2207 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2208 pixel[x*4+0] = d[0];
2209 pixel[x*4+1] = d[1];
2210 pixel[x*4+2] = d[2];
2211 pixel[x*4+3] = d[3];
// 8-bit variant of the span finish: in4ub holds four bytes per pixel and is
// also read as one 32-bit word per pixel through `ini`. The span is blended
// into the color framebuffer (`pixeli`, one 32-bit word per pixel) according
// to thread->fb_blendmode, using SSE2 for the blend arithmetic.
//
// The span is split into runs of consecutive unmasked pixels ("subspans")
// so the copy/blend loops can run without per-pixel mask tests.
2217 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2221 int startx = span->startx;
2222 int endx = span->endx;
2224 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2225 unsigned char * RESTRICT pixelmask = span->pixelmask;
2226 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2227 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// both views are advanced to the first pixel of this span
2230 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2231 pixeli += span->y * dpsoftrast.fb_width + span->x;
2232 // handle alphatest now (this affects depth writes too)
2233 if (thread->alphatest)
2234 for (x = startx;x < endx;x++)
2235 if (in4ub[x*4+3] < 128)
2236 pixelmask[x] = false;
2237 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2238 // helps sprites, text and hud artwork
2239 switch(thread->fb_blendmode)
2241 case DPSOFTRAST_BLENDMODE_ALPHA:
2242 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2243 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// alpha == 0 contributes nothing in these modes, so mask the pixel off
2244 for (x = startx;x < endx;x++)
2245 if (in4ub[x*4+3] < 1)
2246 pixelmask[x] = false;
2248 case DPSOFTRAST_BLENDMODE_OPAQUE:
2249 case DPSOFTRAST_BLENDMODE_ADD:
2250 case DPSOFTRAST_BLENDMODE_INVMOD:
2251 case DPSOFTRAST_BLENDMODE_MUL:
2252 case DPSOFTRAST_BLENDMODE_MUL2:
2253 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2254 case DPSOFTRAST_BLENDMODE_INVADD:
2257 // put some special values at the end of the mask to ensure the loops end
// (a 1 stops the "all masked off" scan, the following 0 stops the
// "all enabled" scan; this writes two bytes past endx, so the mask buffer
// must have that much slack)
2258 pixelmask[endx] = 1;
2259 pixelmask[endx+1] = 0;
2260 // LordHavoc: use a double loop to identify subspans, this helps the
2261 // optimized copy/blend loops to perform at their best, most triangles
2262 // have only one run of pixels, and do the search using wide reads...
2266 // if this pixel is masked off, it's probably not alone...
2273 // the 4-item search must be aligned or else it stalls badly
2274 if ((x & 3) && !pixelmask[x]) x++;
2275 if ((x & 3) && !pixelmask[x]) x++;
2276 if ((x & 3) && !pixelmask[x]) x++;
// Read the four mask BYTES starting at index x. The address must be formed
// from the byte pointer (pixelmask + x); casting first and then adding x
// would scale x by sizeof(unsigned int) and inspect bytes 4x..4x+3 instead,
// skipping the sentinels and reading outside the mask.
2277 while (*(unsigned int *)(pixelmask + x) == 0x00000000)
2281 for (;!pixelmask[x];x++)
2283 // rather than continue the loop, just check the end variable
2287 // find length of subspan
2292 if ((subx & 3) && pixelmask[subx]) subx++;
2293 if ((subx & 3) && pixelmask[subx]) subx++;
2294 if ((subx & 3) && pixelmask[subx]) subx++;
// same byte-indexed wide read as above, this time matching four enabled pixels
2295 while (*(unsigned int *)(pixelmask + subx) == 0x01010101)
2299 for (;pixelmask[subx];subx++)
2301 // the checks can overshoot, so make sure to clip it...
2304 // now that we know the subspan length... process!
2305 switch(thread->fb_blendmode)
// OPAQUE: plain copy of the source words into the framebuffer; the listing
// shows a memcpy form and unaligned SSE copies of 16 and 4 pixels at a time
2307 case DPSOFTRAST_BLENDMODE_OPAQUE:
2311 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2316 while (x + 16 <= subx)
2318 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2319 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2320 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2321 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2326 while (x + 4 <= subx)
2328 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2334 pixeli[x+1] = ini[x+1];
// ALPHA: dst = dst + (src - dst) * alpha / 256
// FINISHBLEND(blend2, blend1) widens source and destination bytes to 16-bit
// lanes, applies blend2 to two pixels at a time and blend1 to a single
// leftover pixel, then packs back to bytes with unsigned saturation.
2344 case DPSOFTRAST_BLENDMODE_ALPHA:
2345 #define FINISHBLEND(blend2, blend1) \
2346 for (;x + 1 < subx;x += 2) \
2349 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2350 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2352 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2357 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2358 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2360 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2364 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2365 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2367 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2368 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst = dst + (src * alpha >> 8)
2371 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2373 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2374 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2376 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2377 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = dst + src
2380 case DPSOFTRAST_BLENDMODE_ADD:
2381 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst = dst - (dst * src >> 8)
2383 case DPSOFTRAST_BLENDMODE_INVMOD:
2385 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2387 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = src * dst >> 8
2390 case DPSOFTRAST_BLENDMODE_MUL:
2391 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = src * dst >> 7 (twice the MUL result)
2393 case DPSOFTRAST_BLENDMODE_MUL2:
2394 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst = dst - (src * alpha >> 8)
2396 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2398 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2399 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2401 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2402 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - (dst * alpha >> 8)
2405 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2407 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2408 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2410 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2411 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst = dst + (255 - dst) * src / 256
2414 case DPSOFTRAST_BLENDMODE_INVADD:
2416 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2418 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texunitindex across one span and writes
// the result to out4f as four floats per pixel in the range 0..1. Texel
// byte 2 goes to output component 0, byte 1 to component 1, byte 0 to
// component 2 and byte 3 to component 3.
//
// Texture coordinates come from vertex attribute `arrayindex`: at the ends
// of each sub-span (at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels) they are
// computed as (data + slope*x) * zf[x] * size, and stepped linearly in
// 20.12 fixed point in between.
2426 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2429 int startx = span->startx;
2430 int endx = span->endx;
2435 float tc[2], endtc[2];
2437 unsigned int tci[2];
2438 unsigned int tci1[2];
2439 unsigned int tcimin[2];
2440 unsigned int tcimax[2];
2445 const unsigned char * RESTRICT pixelbase;
2446 const unsigned char * RESTRICT pixel[4];
2447 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2448 // if no texture is bound, just fill it with white
2451 for (x = startx;x < endx;x++)
2453 out4f[x*4+0] = 1.0f;
2454 out4f[x*4+1] = 1.0f;
2455 out4f[x*4+2] = 1.0f;
2456 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is the byte offset of the selected mip level inside
// texture->bytes; [1] is compared against 4 bytes (one texel) below,
// [2] is used as the row width and [3] as the second-axis size
2460 mip = triangle->mip[texunitindex];
2461 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2462 // if this mipmap of the texture is 1 pixel, just fill it with that color
2463 if (texture->mipmap[mip][1] == 4)
// read the single texel from the selected mip level (pixelbase), not from
// the start of the texture data, so the fill color is that of this mip;
// the BGRA8 variant of this function also reads it through pixelbase
2465 c[0] = pixelbase[2] * (1.0f/255.0f);
2466 c[1] = pixelbase[1] * (1.0f/255.0f);
2467 c[2] = pixelbase[0] * (1.0f/255.0f);
2468 c[3] = pixelbase[3] * (1.0f/255.0f);
2469 for (x = startx;x < endx;x++)
2471 out4f[x*4+0] = c[0];
2472 out4f[x*4+1] = c[1];
2473 out4f[x*4+2] = c[2];
2474 out4f[x*4+3] = c[3];
2478 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2479 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2480 flags = texture->flags;
2481 tcscale[0] = texture->mipmap[mip][2];
2482 tcscale[1] = texture->mipmap[mip][3];
2483 tciwidth = texture->mipmap[mip][2];
// NOTE(review): tcimin is not assigned in the visible lines; confirm it is
// set (presumably to 0) in the full source.
2486 tcimax[0] = texture->mipmap[mip][2]-1;
2487 tcimax[1] = texture->mipmap[mip][3]-1;
// the wrap mask is size-1, which only wraps correctly for power-of-two sizes
2488 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2489 tciwrapmask[1] = texture->mipmap[mip][3]-1;
2490 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2491 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2497 for (x = startx;x < endx;)
2499 unsigned int subtc[2];
2500 unsigned int substep[2];
// subscale converts a per-sub-span coordinate delta into a per-pixel
// 20.12 fixed-point step
2501 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2502 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2503 if (nextsub >= endx)
2505 nextsub = endsub = endx-1;
2506 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2510 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2511 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2517 substep[0] = (endtc[0] - tc[0]) * subscale;
2518 substep[1] = (endtc[1] - tc[1]) * subscale;
2519 subtc[0] = tc[0] * (1<<12);
2520 subtc[1] = tc[1] * (1<<12);
// bilinear sampling, texel indices clamped to [tcimin, tcimax]
2523 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2525 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// frac/ifrac are the 12-bit fractional weights; the four lerp weights are
// products of two of them and always sum to 0x1000000, so a texel byte of
// 255 sums to 0xFF000000, the normalisation constant used below
2527 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2528 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2529 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2530 tci[0] = subtc[0]>>12;
2531 tci[1] = subtc[1]>>12;
2532 tci1[0] = tci[0] + 1;
2533 tci1[1] = tci[1] + 1;
2534 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2535 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2536 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2537 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2538 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2539 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2540 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2541 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2542 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2543 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2544 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2545 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2546 out4f[x*4+0] = c[0];
2547 out4f[x*4+1] = c[1];
2548 out4f[x*4+2] = c[2];
2549 out4f[x*4+3] = c[3];
// bilinear sampling, texel indices wrapped with tciwrapmask
2554 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2556 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2557 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2558 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2559 tci[0] = subtc[0]>>12;
2560 tci[1] = subtc[1]>>12;
2561 tci1[0] = tci[0] + 1;
2562 tci1[1] = tci[1] + 1;
2563 tci[0] &= tciwrapmask[0];
2564 tci[1] &= tciwrapmask[1];
2565 tci1[0] &= tciwrapmask[0];
2566 tci1[1] &= tciwrapmask[1];
2567 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2568 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2569 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2570 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2571 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2572 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2573 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2574 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2575 out4f[x*4+0] = c[0];
2576 out4f[x*4+1] = c[1];
2577 out4f[x*4+2] = c[2];
2578 out4f[x*4+3] = c[3];
// single-texel (nearest) sampling, texel indices clamped
2582 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2584 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2586 tci[0] = subtc[0]>>12;
2587 tci[1] = subtc[1]>>12;
2588 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2589 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2590 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2591 c[0] = pixel[0][2] * (1.0f / 255.0f);
2592 c[1] = pixel[0][1] * (1.0f / 255.0f);
2593 c[2] = pixel[0][0] * (1.0f / 255.0f);
2594 c[3] = pixel[0][3] * (1.0f / 255.0f);
2595 out4f[x*4+0] = c[0];
2596 out4f[x*4+1] = c[1];
2597 out4f[x*4+2] = c[2];
2598 out4f[x*4+3] = c[3];
// single-texel (nearest) sampling, texel indices wrapped
2603 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2605 tci[0] = subtc[0]>>12;
2606 tci[1] = subtc[1]>>12;
2607 tci[0] &= tciwrapmask[0];
2608 tci[1] &= tciwrapmask[1];
2609 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2610 c[0] = pixel[0][2] * (1.0f / 255.0f);
2611 c[1] = pixel[0][1] * (1.0f / 255.0f);
2612 c[2] = pixel[0][0] * (1.0f / 255.0f);
2613 c[3] = pixel[0][3] * (1.0f / 255.0f);
2614 out4f[x*4+0] = c[0];
2615 out4f[x*4+1] = c[1];
2616 out4f[x*4+2] = c[2];
2617 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture sampler: samples the texture bound to
// texunitindex across one span and writes one packed 32-bit pixel per x to
// out4ub (also addressed as words through `outi`), copying texel bytes in
// their stored order.
//
// Texture coordinates are evaluated at the ends of each sub-span as
// (data + slope*x) * zf[x], scaled by the mip size, and stepped linearly in
// 16.16 fixed point in between. Pixels are processed in pairs, with a
// single-pixel tail per sub-span.
2623 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2627 int startx = span->startx;
2628 int endx = span->endx;
2630 __m128 data, slope, tcscale;
2631 __m128i tcsize, tcmask, tcoffset, tcmax;
2633 __m128i subtc, substep, endsubtc;
2636 int affine; // LordHavoc: optimized affine texturing case
2637 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2638 const unsigned char * RESTRICT pixelbase;
2639 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2640 // if no texture is bound, just fill it with white
2643 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2646 mip = triangle->mip[texunitindex];
2647 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2648 // if this mipmap of the texture is 1 pixel, just fill it with that color
2649 if (texture->mipmap[mip][1] == 4)
2651 unsigned int k = *((const unsigned int *)pixelbase);
2652 for (x = startx;x < endx;x++)
// equal zf at both ends of the span: the whole span is treated as a single
// sub-span (see the `|| affine` test below)
2656 affine = zf[startx] == zf[endx-1];
2657 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2658 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2659 flags = texture->flags;
// tcsize = { mipmap[mip][2], mipmap[mip][3] } repeated twice (the two mip
// dimensions); tcmask = size - 1
2660 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2661 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2662 tcscale = _mm_cvtepi32_ps(tcsize);
2663 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2664 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2665 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// NOTE(review): the half-texel shift below is presumably applied only when
// filtering is enabled; the guarding line is not visible here.
2667 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2668 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// tcoffset holds, per 32-bit lane, the 16-bit pair { 4, width*4 }: used with
// _mm_madd_epi16 on an (x, y) texel index pair it yields the byte offset
// x*4 + y*width*4 of that texel
2669 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 as 16-bit lanes: the upper clamp bound and also the wrap mask
2670 tcmax = _mm_packs_epi32(tcmask, tcmask);
2671 for (x = startx;x < endx;)
2673 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2674 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2675 if (nextsub >= endx || affine)
2677 nextsub = endsub = endx-1;
2678 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2682 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2684 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2685 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2686 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinates of pixel x, high half = coordinates of pixel x+1;
// the step is doubled because each iteration advances two pixels
2687 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2688 substep = _mm_slli_epi32(substep, 1);
// fast path: if the integer coordinates at both ends of the sub-span are
// >= 0 and < size-1, no clamping or wrapping is needed and the two texels
// of a row can be fetched with one 64-bit load
2691 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2692 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// row stride in bytes (width*4), taken from the upper 16 bits of tcoffset
2694 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2695 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2697 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2698 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2699 tci = _mm_madd_epi16(tci, tcoffset);
2700 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2701 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2702 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2703 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2704 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2705 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2706 fracm = _mm_srli_epi16(subtc, 1);
2707 pix1 = _mm_add_epi16(pix1,
2708 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2709 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2710 pix3 = _mm_add_epi16(pix3,
2711 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2712 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2713 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2714 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2715 pix2 = _mm_add_epi16(pix2,
2716 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2717 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2718 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2722 const unsigned char * RESTRICT ptr1;
2723 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2724 tci = _mm_madd_epi16(tci, tcoffset);
2725 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2726 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2727 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2728 fracm = _mm_srli_epi16(subtc, 1);
2729 pix1 = _mm_add_epi16(pix1,
2730 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2731 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2732 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2733 pix1 = _mm_add_epi16(pix1,
2734 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2735 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2736 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with clamp-to-edge: the four neighbouring texel indices
// (x,y), (x+1,y), (x,y+1), (x+1,y+1) are clamped to [0, size-1]
2740 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2742 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2744 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2745 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2746 tci = _mm_madd_epi16(tci, tcoffset);
2747 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2748 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2749 _mm_setzero_si128());
2750 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2751 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2752 _mm_setzero_si128());
2753 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// the second pixel of the pair must be clamped exactly like the first one
// (and like the single-pixel tail below); masking with tcmax here would
// wrap it to the opposite edge of the texture instead
2754 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2755 tci = _mm_madd_epi16(tci, tcoffset);
2756 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2757 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2758 _mm_setzero_si128());
2759 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2760 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2761 _mm_setzero_si128());
2762 fracm = _mm_srli_epi16(subtc, 1);
2763 pix1 = _mm_add_epi16(pix1,
2764 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2765 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2766 pix3 = _mm_add_epi16(pix3,
2767 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2768 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2769 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2770 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2771 pix2 = _mm_add_epi16(pix2,
2772 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2773 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2774 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2778 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2779 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2780 tci = _mm_madd_epi16(tci, tcoffset);
2781 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2782 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2783 _mm_setzero_si128());
2784 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2785 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2786 _mm_setzero_si128());
2787 fracm = _mm_srli_epi16(subtc, 1);
2788 pix1 = _mm_add_epi16(pix1,
2789 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2790 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2791 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2792 pix1 = _mm_add_epi16(pix1,
2793 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2794 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2795 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with wrapping: texel indices are masked with size-1
2801 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2803 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2804 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2805 tci = _mm_madd_epi16(tci, tcoffset);
2806 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2807 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2808 _mm_setzero_si128());
2809 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2810 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2811 _mm_setzero_si128());
2812 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2813 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2814 tci = _mm_madd_epi16(tci, tcoffset);
2815 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2816 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2817 _mm_setzero_si128());
2818 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2819 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2820 _mm_setzero_si128());
2821 fracm = _mm_srli_epi16(subtc, 1);
2822 pix1 = _mm_add_epi16(pix1,
2823 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2824 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2825 pix3 = _mm_add_epi16(pix3,
2826 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2827 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2828 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2829 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2830 pix2 = _mm_add_epi16(pix2,
2831 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2832 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2833 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2837 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2838 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2839 tci = _mm_madd_epi16(tci, tcoffset);
2840 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2841 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2842 _mm_setzero_si128());
2843 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2844 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2845 _mm_setzero_si128());
2846 fracm = _mm_srli_epi16(subtc, 1);
2847 pix1 = _mm_add_epi16(pix1,
2848 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2849 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2850 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2851 pix1 = _mm_add_epi16(pix1,
2852 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2853 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2854 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel (nearest) sampling with clamp-to-edge
2861 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2863 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2865 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2866 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2867 tci = _mm_madd_epi16(tci, tcoffset);
2868 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2869 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2873 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2874 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2875 tci = _mm_madd_epi16(tci, tcoffset);
2876 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel (nearest) sampling with wrapping
2882 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2884 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2885 tci = _mm_and_si128(tci, tcmax);
2886 tci = _mm_madd_epi16(tci, tcoffset);
2887 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2888 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2892 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2893 tci = _mm_and_si128(tci, tcmax);
2894 tci = _mm_madd_epi16(tci, tcoffset);
2895 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Placeholder cube-map span sampler: performs no texture lookup and instead
// fills the span's pixels [span->startx, span->endx) in out4ub with 0xFF bytes
// (4 bytes per pixel). triangle, texunitindex, arrayindex and zf are not used
// by the visible code.
2904 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// The byte count is (endx - startx)*4. The previous (startx - endx)*4 was
// negative for any non-empty span and became a huge size_t when passed to
// memset, overrunning the destination buffer.
2907 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Takes a pointer to float data (vector) and returns a float.
// NOTE(review): going by the name this is a shadow-map lookup, but the body is
// not among the visible lines, so what it actually returns must be confirmed
// against the full file.
2910 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Modulates a float RGBA span by an interpolated vertex attribute.
// For each x in [span->startx, span->endx) the attribute selected by arrayindex
// is evaluated as (data + slope*x) * z, and out4f[x] receives in4f[x] multiplied
// component-wise by that value (4 floats per pixel).
// NOTE(review): the declarations of x, data, slope, c and z are not among the
// visible lines; z is presumably the per-pixel value read from zf -- confirm.
2916 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2919 int startx = span->startx;
2920 int endx = span->endx;
// sets up data and slope for the requested attribute; they are used below as data + slope*x
2925 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2926 for (x = startx;x < endx;x++)
// attribute value at this pixel
2929 c[0] = (data[0] + slope[0]*x) * z;
2930 c[1] = (data[1] + slope[1]*x) * z;
2931 c[2] = (data[2] + slope[2]*x) * z;
2932 c[3] = (data[3] + slope[3]*x) * z;
// scale the incoming color by the attribute
2933 out4f[x*4+0] = in4f[x*4+0] * c[0];
2934 out4f[x*4+1] = in4f[x*4+1] * c[1];
2935 out4f[x*4+2] = in4f[x*4+2] * c[2];
2936 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute into a float RGBA span.
// For each x in [span->startx, span->endx) the attribute selected by arrayindex
// is evaluated as (data + slope*x) * z and stored in out4f[x] (4 floats per pixel).
// NOTE(review): the declarations of x, data, slope, c and z are not among the
// visible lines; z is presumably the per-pixel value read from zf -- confirm.
2940 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2943 int startx = span->startx;
2944 int endx = span->endx;
// sets up data and slope for the requested attribute; they are used below as data + slope*x
2949 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2950 for (x = startx;x < endx;x++)
// attribute value at this pixel
2953 c[0] = (data[0] + slope[0]*x) * z;
2954 c[1] = (data[1] + slope[1]*x) * z;
2955 c[2] = (data[2] + slope[2]*x) * z;
2956 c[3] = (data[3] + slope[3]*x) * z;
// store it unmodified
2957 out4f[x*4+0] = c[0];
2958 out4f[x*4+1] = c[1];
2959 out4f[x*4+2] = c[2];
2960 out4f[x*4+3] = c[3];
// Adds a thresholded second buffer to a float RGBA span.
// For each x in [span->startx, span->endx) and each of the 4 components:
//   out4f = ina4f + max(inb4f - subcolor, 0)
// triangle is not used by the visible code.
2964 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2966 int x, startx = span->startx, endx = span->endx;
2967 float c[4], localcolor[4];
// copy the subtract color into a local array before entering the loop
2968 localcolor[0] = subcolor[0];
2969 localcolor[1] = subcolor[1];
2970 localcolor[2] = subcolor[2];
2971 localcolor[3] = subcolor[3];
2972 for (x = startx;x < endx;x++)
// subtract the threshold color and clamp negative results to zero
2974 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2975 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2976 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2977 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
// add what remains on top of the first buffer
2978 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2979 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2980 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2981 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Component-wise product of two float RGBA spans:
// out4f[x] = ina4f[x] * inb4f[x] for x in [span->startx, span->endx).
// triangle is not used by the visible code.
2985 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2987 int x, startx = span->startx, endx = span->endx;
2988 for (x = startx;x < endx;x++)
2990 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2991 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2992 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2993 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Component-wise sum of two float RGBA spans:
// out4f[x] = ina4f[x] + inb4f[x] for x in [span->startx, span->endx).
// No clamping is applied. triangle is not used by the visible code.
2997 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2999 int x, startx = span->startx, endx = span->endx;
3000 for (x = startx;x < endx;x++)
3002 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
3003 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
3004 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
3005 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float RGBA spans using weights a and b:
// out4f[x] = ina4f[x]*a + inb4f[x]*b for x in [span->startx, span->endx),
// where a = 1 - (4th component of inb4f[x]).
// NOTE(review): the declaration of a/b and the assignment of b are not among
// the visible lines; presumably b = inb4f[x*4+3], which would make this an
// alpha blend of inb over ina -- confirm.
// triangle is not used by the visible code.
3009 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3011 int x, startx = span->startx, endx = span->endx;
3013 for (x = startx;x < endx;x++)
// weight of the first buffer is the complement of the second buffer's 4th component
3015 a = 1.0f - inb4f[x*4+3];
3017 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
3018 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
3019 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
3020 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a float RGBA span toward a constant color, weighted by that color's
// 4th component:
//   out4f[x] = in4f[x] * (1 - color[3]) + color * color[3]
// for x in [span->startx, span->endx). triangle is not used by the visible code.
3024 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
3026 int x, startx = span->startx, endx = span->endx;
3027 float localcolor[4], ilerp, lerp;
// copy the color into a local array before entering the loop
3028 localcolor[0] = color[0];
3029 localcolor[1] = color[1];
3030 localcolor[2] = color[2];
3031 localcolor[3] = color[3];
// blend weights are constant across the span
3032 ilerp = 1.0f - localcolor[3];
3033 lerp = localcolor[3];
3034 for (x = startx;x < endx;x++)
3036 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3037 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3038 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3039 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// 8-bit-per-channel, SSE2 version of "multiply a span by an interpolated attribute".
// Reads 4-byte pixels from in4ub, scales each channel by the attribute selected
// by arrayindex and writes the result to out4ub for x in [span->startx, span->endx).
// The attribute is evaluated as (data + slope*x) * zf[x] only at sub-span end
// points (at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and stepped linearly
// in fixed point (scale 256) in between.
// NOTE(review): the declarations of x, data, slope, mod, endmod and the
// statements that seed mod/submod at the top of each sub-span are not among the
// visible lines; presumably mod = endmod and submod = endsubmod -- confirm.
3045 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3049 int startx = span->startx;
3050 int endx = span->endx;
3053 __m128i submod, substep, endsubmod;
3054 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the attribute (presumably RGBA order -> the BGRA byte order of the buffers)
3055 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3056 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as 32-bit fixed point (x256)
3057 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3058 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3059 for (x = startx; x < endx;)
// pick the end of this sub-span and the scale that turns the end-to-end delta into a fixed-point step
3061 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3062 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3063 if (nextsub >= endx)
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
3065 nextsub = endsub = endx-1;
3066 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute value at the sub-span end, and the fixed-point step toward it
3070 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3071 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3072 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = modulator for pixel x, high half = modulator for pixel x+1
3073 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3074 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 here but submod advances by a single substep per iteration -- confirm whether the step should be doubled.
3075 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// input bytes are placed in the high byte of each 16-bit lane, so the high 16 bits of the product equal in * mod / 256
3077 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3078 pix = _mm_mulhi_epu16(pix, submod);
3079 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (the guarding condition is not among the visible lines)
3083 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3084 pix = _mm_mulhi_epu16(pix, submod);
3085 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit-per-channel, SSE2 version of "write an interpolated attribute into a span".
// For x in [span->startx, span->endx) the attribute selected by arrayindex is
// converted to 4 bytes per pixel and stored in out4ub.
// The attribute is evaluated as (data + slope*x) * zf[x] only at sub-span end
// points (at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and stepped linearly
// in fixed point (scale 4095) in between; the final >>4 brings it to a 0..255 range.
// NOTE(review): the declarations of x, data, slope, mod, endmod and the
// statements that seed mod/submod at the top of each sub-span are not among the
// visible lines; presumably mod = endmod and submod = endsubmod -- confirm.
3092 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3096 int startx = span->startx;
3097 int endx = span->endx;
3100 __m128i submod, substep, endsubmod;
3101 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the attribute (presumably RGBA order -> the BGRA byte order of the buffer)
3102 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3103 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as 32-bit fixed point (x4095)
3104 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3105 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3106 for (x = startx; x < endx;)
// pick the end of this sub-span and the scale that turns the end-to-end delta into a fixed-point step
3108 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3109 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3110 if (nextsub >= endx)
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
3112 nextsub = endsub = endx-1;
3113 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute value at the sub-span end, and the fixed-point step toward it
3117 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3118 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3119 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
3120 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3121 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 here but submod advances by a single substep per iteration -- confirm whether the step should be doubled.
3122 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then saturate to unsigned bytes
3124 __m128i pix = _mm_srai_epi16(submod, 4);
3125 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (the guarding condition is not among the visible lines)
3129 __m128i pix = _mm_srai_epi16(submod, 4);
3130 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit-per-channel, SSE2 version of the thresholded add:
//   out4ub = saturate(ina4ub + max(inb4ub - subcolor*255, 0))
// for x in [span->startx, span->endx), 4 bytes per pixel.
// subcolor is read as 4 floats; triangle is not used by the visible code.
3137 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3140 int x, startx = span->startx, endx = span->endx;
// scale the subtract color to 0..255, swap components 0 and 2, and replicate it into both halves as 16-bit lanes
3141 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3142 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
3143 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs from bytes to 16-bit lanes
3145 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3146 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps at zero; packus saturates the sum back to bytes
3147 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3148 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (the guarding condition is not among the visible lines)
3152 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3153 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3154 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3155 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit-per-channel, SSE2 product of two spans:
//   out4ub = (ina4ub * inb4ub) / 256 per channel
// for x in [span->startx, span->endx), 4 bytes per pixel.
// triangle is not used by the visible code.
3160 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3163 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3164 for (x = startx;x+2 <= endx;x+=2)
// ina goes into the low byte of each 16-bit lane, inb into the high byte, so the
// high 16 bits of the product are ina*inb/256
3166 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3167 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3168 pix1 = _mm_mulhi_epu16(pix1, pix2);
3169 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (the guarding condition is not among the visible lines)
3173 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3174 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3175 pix1 = _mm_mulhi_epu16(pix1, pix2);
3176 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit-per-channel, SSE2 saturating sum of two spans:
//   out4ub = min(ina4ub + inb4ub, 255) per channel
// for x in [span->startx, span->endx), 4 bytes per pixel.
// triangle is not used by the visible code.
3181 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3184 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3185 for (x = startx;x+2 <= endx;x+=2)
// widen to 16-bit lanes, add, then let packus saturate back to bytes
3187 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3188 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3189 pix1 = _mm_add_epi16(pix1, pix2);
3190 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (the guarding condition is not among the visible lines)
3194 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3195 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3196 pix1 = _mm_add_epi16(pix1, pix2);
3197 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit-per-channel, SSE2 tinted add of two spans:
//   out4ub = saturate(ina4ub + inb4ub * inbtintbgra) per channel
// for x in [span->startx, span->endx), 4 bytes per pixel.
// inbtintbgra is read as 4 floats and, unlike the other color parameters in
// this group, is not swizzled before use. triangle is not used by the visible code.
3202 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3205 int x, startx = span->startx, endx = span->endx;
// tint as fixed point (x256), replicated into both halves as 16-bit lanes
3206 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3207 tint = _mm_packs_epi32(tint, tint);
// two pixels per iteration
3208 for (x = startx;x+2 <= endx;x+=2)
// inb sits in the high byte of each lane so mulhi(tint, inb<<8) yields inb*tint
3210 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3211 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3212 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3213 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (the guarding condition is not among the visible lines)
3217 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3218 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3219 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3220 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit-per-channel, SSE2 blend of inb over ina, weighted by inb's 4th byte:
//   out4ub = ina4ub + (inb4ub - ina4ub) * inb_alpha / 256 per channel
// for x in [span->startx, span->endx), 4 bytes per pixel.
// triangle is not used by the visible code.
3225 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3228 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3229 for (x = startx;x+2 <= endx;x+=2)
3231 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3232 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's 4th channel of inb to all four lanes of that pixel
3233 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply returns (diff * blend) / 256
3234 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3235 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (the guarding condition is not among the visible lines)
3239 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3240 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3241 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3242 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3243 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit-per-channel, SSE2 blend of a span toward a constant color, weighted by
// that color's 4th component:
//   out4ub = in4ub + (color*255 - in4ub) * (color[3]*255) / 256 per channel
// for x in [span->startx, span->endx), 4 bytes per pixel.
// color is read as 4 floats; triangle is not used by the visible code.
3248 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3251 int x, startx = span->startx, endx = span->endx;
// scale the color to 0..255, swap components 0 and 2, and replicate it into both halves as 16-bit lanes
3252 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3253 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend weight = the color's 4th channel broadcast to every lane, pre-shifted by 4 for the mulhi below
3254 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration
3255 for (x = startx;x+2 <= endx;x+=2)
3257 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (diff << 4) * (weight << 4) >> 16 == diff * weight / 256
3258 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3259 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (the guarding condition is not among the visible lines)
3263 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3264 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3265 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: transforms/projects the position array
// with the ModelViewProjection matrix uniform, passes the color and first
// texcoord arrays through, and also passes the second texcoord array when the
// SPECULAR permutation bit is set in the global dpsoftrast state.
3272 void DPSOFTRAST_VertexShader_Generic(void)
3274 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3275 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3276 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// the second texture coordinate set is only needed when the second texture is sampled
3277 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3278 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader for one span.
// With the DIFFUSE permutation: samples the first texture (attribute 2),
// multiplies it by interpolated attribute 1, and, if SPECULAR is set, samples
// the second texture and combines it with the result by multiply
// (COLORMAPPING), add (GLOW) or alpha blend (VERTEXTEXTUREBLEND).
// The final color span is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// All work happens in stack buffers sized DPSOFTRAST_DRAW_MAXSPANLENGTH.
3281 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3283 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3284 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3285 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3286 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// fills buffer_z, which the span helpers below take as their per-pixel z input
3287 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3288 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture modulated by interpolated attribute 1
3290 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3291 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3292 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// second texture, combined according to the permutation bits
3294 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3295 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3298 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// This branch previously repeated the COLORMAPPING test of the `if` above, so
// the additive path could never run. It now tests GLOW.
// NOTE(review): GLOW is taken to be the permutation bit that selects additive
// combination in generic mode -- confirm against the code that builds the
// permutation for this shader.
3300 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3303 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3305 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3308 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// interpolated attribute 1 alone (presumably the non-DIFFUSE fallback; the `else` line is not among the visible lines)
3313 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3314 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: transforms/projects the position
// array with the ModelViewProjection matrix uniform and loads two texcoord arrays.
3319 void DPSOFTRAST_VertexShader_PostProcess(void)
3321 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3322 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// note the two array indices differ here (TEXCOORD1 and TEXCOORD4), unlike the other Array_Load calls
3323 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader for one span.
// Samples the first texture (attribute 2) into the output color; with the BLOOM
// permutation it samples the second texture (attribute 3) and adds it after
// subtracting the BloomColorSubtract uniform; then blends toward the
// ViewTintColor uniform and finishes the span.
// The SATURATION and GAMMARAMPS permutations are recognised but not implemented.
3326 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3328 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3329 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3330 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3331 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// fills buffer_z, which the span helpers below take as their per-pixel z input
3332 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3333 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3334 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// thresholded add of the second texture on top of the first
3336 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3337 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// blend toward the tint color using the tint's own 4th component as weight
3339 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3340 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3342 // TODO: implement saturation
3344 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3346 // TODO: implement gammaramps
3348 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only transforms/projects the
// position array with the ModelViewProjection matrix uniform; no other
// attribute arrays are loaded.
3353 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3355 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader for one span: produces an all-zero
// color for pixels [span->startx, span->endx) and finishes the span.
3358 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3360 // this is never called (because colormask is off when this shader is used)
3361 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3362 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3363 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// zero 4 bytes per pixel over the span's active range
3364 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3365 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: transforms/projects the position array
// with the ModelViewProjection matrix uniform and transforms the first texcoord
// array with the TexMatrix uniform.
3370 void DPSOFTRAST_VertexShader_FlatColor(void)
3372 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3373 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span:
//   output = texture(GL20TU_COLOR) * Color_Ambient, with the 4th channel scaled
//   by the Alpha uniform instead of the ambient color's own 4th component.
// When neither alpha test nor blending is active the result is written directly
// into the framebuffer row; otherwise it goes to a temporary buffer that is then
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several short statements of the loop body (declarations of
// color/pix/pix2, the x advance after the 4-pixel path, and any per-pixel mask
// test) are not among the visible lines.
3376 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3379 unsigned char * RESTRICT pixelmask = span->pixelmask;
// address of the span's first pixel in color buffer 0 (4 bytes per pixel)
3380 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3381 int x, startx = span->startx, endx = span->endx;
3382 __m128i Color_Ambientm;
3383 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3384 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3385 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3386 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3387 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the temporary buffer instead
3388 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3389 pixel = buffer_FragColorbgra8;
// ambient color as fixed point (x256) with components 0 and 2 swapped; the 4th lane
// is cleared and replaced by the Alpha uniform (x255), then everything is packed to 16-bit lanes
3390 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3391 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3392 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3393 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3394 for (x = startx;x < endx;x++)
// fast path: at least 4 pixels remain and all four of their mask bytes are 1
3397 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texels are unpacked into the high byte of each 16-bit lane so mulhi gives texel * ambient
3400 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3401 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3402 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3403 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3409 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3410 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3411 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the result was rendered to the temporary buffer
3413 if (pixel == buffer_FragColorbgra8)
3414 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: transforms/projects the position
// array with the ModelViewProjection matrix uniform, passes the color array
// through, and transforms the first texcoord array with the TexMatrix uniform.
3420 void DPSOFTRAST_VertexShader_VertexColor(void)
3422 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3423 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3424 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span:
//   output = texture(GL20TU_COLOR) * (Color_Diffuse * vertexcolor + Color_Ambient)
// where vertexcolor is the DPSOFTRAST_ARRAY_COLOR attribute interpolated per
// pixel as (data + slope*x) * buffer_z[x]. The ambient term's 4th lane carries
// the Alpha uniform; the diffuse term's 4th lane is zeroed.
// When neither alpha test nor blending is active the result is written directly
// into the framebuffer row; otherwise it goes to a temporary buffer that is then
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several short statements (declarations of data/slope/mod2/pix2,
// the x advance after the 4-pixel path, and any per-pixel mask test) are not
// among the visible lines.
3427 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3430 unsigned char * RESTRICT pixelmask = span->pixelmask;
// address of the span's first pixel in color buffer 0 (4 bytes per pixel)
3431 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3432 int x, startx = span->startx, endx = span->endx;
3433 __m128i Color_Ambientm, Color_Diffusem;
3435 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3436 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3437 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3438 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3439 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3440 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the temporary buffer instead
3441 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3442 pixel = buffer_FragColorbgra8;
// ambient color as fixed point (x256) with components 0 and 2 swapped; 4th lane replaced by the Alpha uniform (x255)
3443 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3444 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3445 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3446 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color as fixed point (x4096) with components 0 and 2 swapped; 4th lane cleared
3447 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3448 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3449 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color attribute: swap components 0 and 2, advance to startx, and pre-scale by 4096
// so that mulhi(diffuse*4096, color*4096) lands on the same x256 scale as the ambient term
3450 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3451 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3452 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3453 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3454 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3455 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration
3456 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3458 __m128i color, mod, pix;
// fast path: at least 4 pixels remain and all four of their mask bytes are 1
3459 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// compute the four per-pixel modulators, stepping data three times inside the block
3462 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3463 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3464 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3465 data = _mm_add_ps(data, slope);
3466 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3467 data = _mm_add_ps(data, slope);
3468 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3469 data = _mm_add_ps(data, slope);
3470 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texel, with texels in the high byte of each 16-bit lane
3471 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3472 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3473 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3474 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3475 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path, same formula
3481 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3482 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3483 mod = _mm_packs_epi32(mod, mod);
3484 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3485 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the result was rendered to the temporary buffer
3487 if (pixel == buffer_FragColorbgra8)
3488 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: transforms/projects the position array
// with the ModelViewProjection matrix uniform, transforms the first texcoord
// array with the TexMatrix uniform, and passes the TEXCOORD4 array through.
3494 void DPSOFTRAST_VertexShader_Lightmap(void)
3496 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3497 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3498 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3501 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3504 unsigned char * RESTRICT pixelmask = span->pixelmask;
3505 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3506 int x, startx = span->startx, endx = span->endx;
3507 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3508 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3509 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3510 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3511 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3512 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3513 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3514 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3515 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3516 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3517 pixel = buffer_FragColorbgra8;
3518 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3519 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3520 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3521 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3522 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3523 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3524 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3525 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3527 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3528 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3529 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3530 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3531 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3532 for (x = startx;x < endx;x++)
3534 __m128i color, lightmap, glow, pix;
3535 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3538 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3539 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3540 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3541 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3542 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3543 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3544 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3545 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3546 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3547 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3553 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3554 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3555 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3556 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3557 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3558 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3563 for (x = startx;x < endx;x++)
3565 __m128i color, lightmap, pix;
3566 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3569 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3570 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3571 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3572 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3573 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3574 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3575 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3581 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3582 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3583 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3584 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3587 if (pixel == buffer_FragColorbgra8)
3588 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shader entry points that
// follow delegate to the LightDirection shaders, whose definitions appear later in this file.
3593 void DPSOFTRAST_VertexShader_LightDirection(void);
3594 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex shader entry point for the FakeLight mode.
// Does no work of its own: it runs the LightDirection vertex shader.
3596 void DPSOFTRAST_VertexShader_FakeLight(void)
3598 DPSOFTRAST_VertexShader_LightDirection();
// Pixel shader entry point for the FakeLight mode.
// Forwards thread/triangle/span unchanged to the LightDirection pixel shader, which
// checks thread->shader_mode against SHADERMODE_FAKELIGHT to pick its lighting inputs.
3601 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3603 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the model-space LightDirectionMap mode.
// Runs the LightDirection vertex shader, then additionally loads the TEXCOORD4 array,
// which the LightDirection pixel shader uses for its lightmap and deluxemap lookups.
3608 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3610 DPSOFTRAST_VertexShader_LightDirection();
3611 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the model-space LightDirectionMap mode.
// Forwards thread/triangle/span unchanged to the LightDirection pixel shader, which
// handles this mode in its SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE branches.
3614 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3616 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the tangent-space LightDirectionMap mode.
// Runs the LightDirection vertex shader, then additionally loads the TEXCOORD4 array,
// which the LightDirection pixel shader uses for its lightmap and deluxemap lookups.
3621 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3623 DPSOFTRAST_VertexShader_LightDirection();
3624 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the tangent-space LightDirectionMap mode.
// Forwards thread/triangle/span unchanged to the LightDirection pixel shader, which
// handles this mode in its SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE branches.
3627 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3629 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for directional lighting.
// For each of dpsoftrast.numvertices vertices it writes two per-vertex vectors into the
// post-transform arrays:
//   TEXCOORD5 = the LightDir uniform expressed in the basis (TEXCOORD1, TEXCOORD2, TEXCOORD3)
//   TEXCOORD6 = (EyePosition uniform - vertex position) expressed in the same basis
// and finally transforms/projects the positions with the ModelViewProjection matrix.
// The three basis arrays are read as svector/tvector/normal — presumably the per-vertex
// tangent-space basis; confirm against the code that fills these arrays.
3634 void DPSOFTRAST_VertexShader_LightDirection(void)
3637 int numvertices = dpsoftrast.numvertices;
3639 float LightVector[4];
3640 float EyePosition[4];
3641 float EyeVectorModelSpace[4];
// Copy the uniforms into locals. The [3] components are copied as well, but the
// per-vertex loop below only reads components 0..2.
3647 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3648 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3649 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3650 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3651 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3652 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3653 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3654 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the input arrays: position and TEXCOORD1..3 are loaded as-is, while TEXCOORD0
// goes through DPSOFTRAST_Array_Transform with the TexMatrix uniform.
3655 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3656 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3657 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3658 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3659 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3660 for (i = 0;i < numvertices;i++)
// Gather this vertex's position and its three basis vectors (4 floats per vertex, xyz used).
3662 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3663 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3664 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3665 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3666 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3667 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3668 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3669 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3670 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3671 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3672 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3673 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (dot(svector, LightDir), dot(tvector, LightDir), dot(normal, LightDir)).
3674 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3675 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3676 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
// Store it in TEXCOORD5 with w = 0; the pixel shader interpolates this attribute.
3677 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3678 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3679 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3680 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex towards the eye, using the still-untransformed position.
3681 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3682 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3683 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// Express the eye vector in the same basis as LightVector.
3684 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3685 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3686 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// Store it in TEXCOORD6 with w = 0.
3687 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3688 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3689 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3690 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Positions are projected last, after the loop has read them in their loaded form.
// NOTE(review): the meaning of the -1 second argument is defined by
// DPSOFTRAST_Array_TransformProject, which is not visible here — confirm.
3692 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar and 3-component vector helpers used by the pixel shaders.
// Min/Max expand the selected argument twice, so do not pass expressions with side effects.
3695 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3696 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of a and b.
3697 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length: v dotted with itself.
3698 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Length: square root of the squared length.
3699 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro that starts by computing the length of v. Only its first lines are
// visible in this excerpt; presumably the rest rescales v in place to unit length — confirm.
3700 #define DPSOFTRAST_Vector3Normalize(v)\
3703 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the LightDirection, LightDirectionMap (model space and tangent
// space) and FakeLight modes; thread->shader_mode selects where the per-pixel light normal
// and light colour come from. It fills buffer_FragColorbgra8 for x in
// [span->startx, span->endx) and passes it to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread:   supplies uniform4f, shader_permutation, shader_mode, shader_exactspecularmath.
//   triangle: passed through to the span, texture-fetch and attribute helpers.
//   span:     supplies startx/endx and is passed through to the same helpers.
// Three shading paths exist: SHADERPERMUTATION_SPECULAR (diffuse + specular),
// SHADERPERMUTATION_DIFFUSE (diffuse only), and a final loop that applies only the
// ambient colour (plus glow when SHADERPERMUTATION_GLOW is set).
3714 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one float of z data and four bytes per pixel for each texture.
3716 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3717 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3718 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3719 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3720 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3721 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3722 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3723 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3724 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3725 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3726 int x, startx = span->startx, endx = span->endx;
3727 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each interpolated attribute is kept as a start value (*data) plus a per-x step (*slope).
3728 float LightVectordata[4];
3729 float LightVectorslope[4];
3730 float EyeVectordata[4];
3731 float EyeVectorslope[4];
3732 float VectorSdata[4];
3733 float VectorSslope[4];
3734 float VectorTdata[4];
3735 float VectorTslope[4];
3736 float VectorRdata[4];
3737 float VectorRslope[4];
3739 float diffusetex[4];
3741 float surfacenormal[4];
3742 float lightnormal[4];
3743 float lightnormal_modelspace[4];
3745 float specularnormal[4];
3748 float SpecularPower;
// Colour uniforms are copied with components 0 and 2 swapped so that array index order
// matches the byte order of the *bgra8 buffers (presumably the uniforms are RGB — confirm).
3750 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3751 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3752 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3753 Color_Glow[3] = 0.0f;
3754 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3755 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3756 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component carries the Alpha uniform; it scales the output alpha.
3757 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3758 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3759 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3760 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3761 Color_Pants[3] = 0.0f;
3762 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3763 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3764 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3765 Color_Shirt[3] = 0.0f;
// Texture fetches common to all paths: colour always, pants/shirt and glow when enabled.
3766 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3767 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// NOTE(review): the pants/shirt buffers are fetched here for every path, but only the
// specular loop below blends them into diffusetex — confirm this is intended.
3768 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3770 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3771 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3773 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3775 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: diffuse + specular lighting ----
3777 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3779 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3780 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3781 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3782 Color_Diffuse[3] = 0.0f;
// Default light colour from the uniform; the lightmap and FakeLight modes overwrite
// components 0..2 per pixel inside the loop.
3783 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3784 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3785 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3786 LightColor[3] = 0.0f;
3787 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3788 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3789 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3790 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3791 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the 0..255 gloss alpha byte.
3792 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed in every mode of this path for the specular term.
3793 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3794 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs: the model-space mode needs the S/T/R vectors (TEXCOORD1..3) plus
// lightmap and deluxemap; the tangent-space mode needs only the two maps.
3796 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3798 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3799 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3800 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3801 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3802 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3804 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3806 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3807 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3809 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3811 // nothing of this needed
// Remaining mode (plain LightDirection): interpolate the light vector written to
// TEXCOORD5 by the vertex shader.
3815 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the specular path.
3818 for (x = startx;x < endx;x++)
// Texture samples are used as raw 0..255 byte values converted to float.
3821 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3822 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3823 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3824 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3825 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add tinted pants and shirt layers; Color_Pants[3] and Color_Shirt[3] are 0, so alpha is unchanged.
3827 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3828 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3829 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3830 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3832 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3833 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3834 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3835 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte/128 - 1, reading bytes 2,1,0 into components 0,1,2
// (undoing the BGRA byte order), then normalize.
3836 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3837 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3838 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3839 DPSOFTRAST_Vector3Normalize(surfacenormal);
3841 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3843 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3844 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3845 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3846 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3848 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3849 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3850 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3851 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3853 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3854 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3855 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3856 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3858 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3859 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3860 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3861 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3863 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3864 DPSOFTRAST_Vector3Normalize(lightnormal);
3866 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes are scaled by 1/256 and divided by lightnormal[2] clamped to at least
// 0.25, so the boost from this division is capped at 4x.
// NOTE(review): uses max rather than this file's DPSOFTRAST_Max; presumably max comes
// from a project header — confirm.
3868 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3869 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3870 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3871 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3874 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// Tangent-space mode: the deluxemap sample is used as the light normal directly
// (no basis transform and no normalize), and the lightmap is scaled by 1/256.
3876 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3877 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3878 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3880 float f = 1.0f / 256.0f;
3881 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3882 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3883 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3886 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// FakeLight mode: the light direction is the normalized eye vector and the light is white.
// z is assigned on a line not visible in this excerpt; presumably the per-pixel value
// from buffer_z — confirm.
3888 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3889 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3890 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3891 DPSOFTRAST_Vector3Normalize(lightnormal);
3893 LightColor[0] = 1.0;
3894 LightColor[1] = 1.0;
3895 LightColor[2] = 1.0;
// Remaining mode (plain LightDirection): normalized interpolated light vector; LightColor
// keeps the uniform value set before the loop.
3899 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3900 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3901 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3902 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(surface normal, light normal), clamped at zero.
3905 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3907 if(thread->shader_exactspecularmath)
3909 // reflect lightnormal at surfacenormal, take the negative of that
3910 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3912 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3913 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3914 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3915 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3917 // dot of this and normalize(EyeVectorFogDepth.xyz)
3918 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3919 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3920 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3921 DPSOFTRAST_Vector3Normalize(eyenormal);
3923 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Otherwise (exact specular math off): use the normalized sum of light and eye normals
// and dot it with the surface normal.
3927 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3928 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3929 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3930 DPSOFTRAST_Vector3Normalize(eyenormal);
3932 specularnormal[0] = lightnormal[0] + eyenormal[0];
3933 specularnormal[1] = lightnormal[1] + eyenormal[1];
3934 specularnormal[2] = lightnormal[2] + eyenormal[2];
3935 DPSOFTRAST_Vector3Normalize(specularnormal);
3937 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform * gloss alpha byte / 255.
// NOTE(review): with IEC 60559 math pow(0, 0) is 1, so a zero gloss alpha yields full
// specular even when the dot product is 0 — confirm this is acceptable.
3940 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine the terms per channel. Results are clamped to 255 only from above; there is no
// lower clamp, so this relies on all terms being non-negative.
3941 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3943 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3944 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3945 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3946 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Same formula without the glow term.
3950 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3951 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3952 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3953 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3956 buffer_FragColorbgra8[x*4+0] = d[0];
3957 buffer_FragColorbgra8[x*4+1] = d[1];
3958 buffer_FragColorbgra8[x*4+2] = d[2];
3959 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse lighting only (no gloss texture, no specular term) ----
3962 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3964 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3965 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3966 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3967 Color_Diffuse[3] = 0.0f;
3968 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3969 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3970 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3971 LightColor[3] = 0.0f;
3972 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs, as in the specular path, except that the eye vector is only
// interpolated here for FakeLight, where it serves as the light direction.
3974 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3976 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3977 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3978 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3979 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3980 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3982 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3984 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3985 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3987 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3989 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3993 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the diffuse-only path.
3996 for (x = startx;x < endx;x++)
3999 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4000 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4001 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4002 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the normal-map sample (byte/128 - 1, bytes 2,1,0 -> components 0,1,2).
4003 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4004 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4005 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4006 DPSOFTRAST_Vector3Normalize(surfacenormal);
4008 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4010 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
4011 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4012 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4013 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4015 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
4016 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
4017 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
4018 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
4020 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
4021 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
4022 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
4023 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
4025 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
4026 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
4027 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
4028 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4030 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4031 DPSOFTRAST_Vector3Normalize(lightnormal);
4033 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Same lightmap scaling as the specular path: 1/256, divided by lightnormal[2] clamped
// to at least 0.25.
4035 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4036 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4037 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4038 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4041 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// Tangent-space mode: deluxemap sample used directly as the light normal, lightmap * 1/256.
4043 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4044 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4045 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4047 float f = 1.0f / 256.0f;
4048 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4049 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4050 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4053 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// FakeLight mode: light direction is the normalized eye vector, light colour is white.
4055 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4056 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4057 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4058 DPSOFTRAST_Vector3Normalize(lightnormal);
4060 LightColor[0] = 1.0;
4061 LightColor[1] = 1.0;
4062 LightColor[2] = 1.0;
// Remaining mode (plain LightDirection): normalized interpolated light vector.
4066 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4067 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4068 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4069 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term clamped at zero, then colour = texture * (ambient + diffuse * light),
// plus glow when enabled; alpha = texture alpha * Color_Ambient[3]. Clamped to 255 from above only.
4072 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4073 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4075 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4076 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4077 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4078 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4082 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4083 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4084 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4085 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4087 buffer_FragColorbgra8[x*4+0] = d[0];
4088 buffer_FragColorbgra8[x*4+1] = d[1];
4089 buffer_FragColorbgra8[x*4+2] = d[2];
4090 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only — no normal map, light vector or light colour is used ----
4095 for (x = startx;x < endx;x++)
4098 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4099 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4100 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4101 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colour = texture * ambient (+ glow when enabled), clamped to 255 from above.
4103 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4105 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4106 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4107 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4108 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4112 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4113 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4114 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4115 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4117 buffer_FragColorbgra8[x*4+0] = d[0];
4118 buffer_FragColorbgra8[x*4+1] = d[1];
4119 buffer_FragColorbgra8[x*4+2] = d[2];
4120 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span colours to the common finish routine.
4123 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4128 void DPSOFTRAST_VertexShader_LightSource(void)
4131 int numvertices = dpsoftrast.numvertices;
4132 float LightPosition[4];
4133 float LightVector[4];
4134 float LightVectorModelSpace[4];
4135 float EyePosition[4];
4136 float EyeVectorModelSpace[4];
4142 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4143 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4144 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4145 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4146 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4147 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4148 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4149 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4150 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4151 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4152 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4153 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4154 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4155 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4156 for (i = 0;i < numvertices;i++)
4158 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4159 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4160 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4161 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4162 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4163 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4164 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4165 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4166 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4167 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4168 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4169 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4170 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4171 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4172 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4173 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4174 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4175 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4176 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4177 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4178 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4179 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4180 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4181 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4182 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4183 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4184 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4185 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4186 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4187 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4188 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4189 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4191 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4192 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel stage of the light-source shader mode.
// Shades the pixels startx..endx-1 of one span into a BGRA8 buffer and hands
// that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8. Depending on
// thread->shader_permutation the colour is built from ambient + diffuse +
// specular terms (SPECULAR), ambient + diffuse (DIFFUSE) or ambient only,
// each scaled by LightColor, by an attenuation factor and, with CUBEFILTER,
// by a cube texture sample.
// thread   - per-thread state; supplies uniform4f, shader_permutation and shader_exactspecularmath
// triangle - triangle being drawn; only forwarded to the span helpers and CALCATTRIB4F
// span     - span being drawn; supplies startx and endx
// NOTE(review): this listing does not show every statement of the function
// (block braces, the else keywords, the per-pixel load of z and whatever
// follows the "attenuation < 0.01f" tests are not visible) - confirm the
// control flow against the full file.
4195 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, indexed by x (four bytes per pixel for the bgra8 ones)
4198 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4199 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4200 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4201 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4202 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4203 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4204 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4205 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4206 int x, startx = span->startx, endx = span->endx;
4207 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolation base and per-pixel slope of the three varyings used below
4208 float CubeVectordata[4];
4209 float CubeVectorslope[4];
4210 float LightVectordata[4];
4211 float LightVectorslope[4];
4212 float EyeVectordata[4];
4213 float EyeVectorslope[4];
4215 float diffusetex[4];
4217 float surfacenormal[4];
4218 float lightnormal[4];
4220 float specularnormal[4];
4223 float SpecularPower;
4224 float CubeVector[4];
// colour uniforms: components 0 and 2 are swapped while loading so that the
// local arrays follow the byte order of the bgra8 buffers (presumably the
// uniforms are stored as RGB - confirm against the code that sets them)
4227 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4228 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4229 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4230 Color_Glow[3] = 0.0f;
4231 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4232 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4233 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component comes from the separate Alpha uniform
4234 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4235 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4236 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4237 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4238 Color_Diffuse[3] = 0.0f;
4239 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4240 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4241 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4242 Color_Specular[3] = 0.0f;
4243 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4244 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4245 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4246 Color_Pants[3] = 0.0f;
4247 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4248 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4249 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4250 Color_Shirt[3] = 0.0f;
4251 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4252 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4253 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4254 LightColor[3] = 0.0f;
// pre-divided by 255 because the exponent is later formed as
// SpecularPower * glosstex[3], where glosstex[3] is a raw texture byte
4255 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// set up base/slope pairs for the light vector (TEXCOORD1), eye vector
// (TEXCOORD2) and cube vector (TEXCOORD3); they are evaluated below as
// (data + slope*x) * z
4256 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4257 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4258 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4259 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4260 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// sample the textures needed by every permutation, then the optional ones
4261 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4262 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4264 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4265 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4267 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4268 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- permutation with specular: ambient + diffuse + specular ----
4269 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4271 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4272 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4273 for (x = startx;x < endx;x++)
// attenuation is 1 minus the squared length of the interpolated cube vector
4276 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4277 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4278 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4279 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by this test is not visible in this
// listing; presumably the pixel is skipped (the buffer was cleared above)
4280 if (attenuation < 0.01f)
4282 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4284 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4285 if (attenuation < 0.01f)
// base colour, optionally with the pants/shirt layers added on top
4289 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4290 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4291 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4292 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4293 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4295 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4296 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4297 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4298 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4300 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4301 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4302 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4303 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte/128 - 1, reading bytes 2,1,0 into components 0,1,2
4304 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4305 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4306 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4307 DPSOFTRAST_Vector3Normalize(surfacenormal);
4309 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4310 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4311 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4312 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: N.L clamped at zero
4314 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// exact specular: dot of the eye direction with the reflected light direction
4316 if(thread->shader_exactspecularmath)
4318 // reflect lightnormal at surfacenormal, take the negative of that
4319 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4321 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4322 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4323 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4324 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4326 // dot of this and normalize(EyeVectorFogDepth.xyz)
4327 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4328 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4329 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4330 DPSOFTRAST_Vector3Normalize(eyenormal);
4332 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// alternative specular: dot of the surface normal with the normalized sum of
// the light and eye directions (presumably the else branch of the test above)
4336 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4337 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4338 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4339 DPSOFTRAST_Vector3Normalize(eyenormal);
4341 specularnormal[0] = lightnormal[0] + eyenormal[0];
4342 specularnormal[1] = lightnormal[1] + eyenormal[1];
4343 specularnormal[2] = lightnormal[2] + eyenormal[2];
4344 DPSOFTRAST_Vector3Normalize(specularnormal);
4346 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent is the SpecularPower uniform scaled by the gloss texture's fourth byte
4348 specular = pow(specular, SpecularPower * glosstex[3]);
4350 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4352 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4353 attenuation *= (1.0f / 255.0f);
4354 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4355 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4356 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4357 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube texture factor
4361 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4362 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4363 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4364 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// store the clamped result for this pixel
4366 buffer_FragColorbgra8[x*4+0] = d[0];
4367 buffer_FragColorbgra8[x*4+1] = d[1];
4368 buffer_FragColorbgra8[x*4+2] = d[2];
4369 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- permutation with diffuse only: ambient + diffuse ----
4372 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4374 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4375 for (x = startx;x < endx;x++)
// attenuation and shadowmap handling as in the specular loop
4378 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4379 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4380 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4381 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4382 if (attenuation < 0.01f)
4384 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4386 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4387 if (attenuation < 0.01f)
4391 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4392 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4393 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4394 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4395 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4397 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4398 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4399 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4400 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4402 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4403 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4404 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4405 DPSOFTRAST_Vector3Normalize(surfacenormal);
4407 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4408 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4409 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4410 DPSOFTRAST_Vector3Normalize(lightnormal);
4412 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4413 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4415 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4416 attenuation *= (1.0f / 255.0f);
4417 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4418 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4419 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4420 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4424 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4425 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4426 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4427 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4429 buffer_FragColorbgra8[x*4+0] = d[0];
4430 buffer_FragColorbgra8[x*4+1] = d[1];
4431 buffer_FragColorbgra8[x*4+2] = d[2];
4432 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- remaining permutation: ambient only, no normal map is sampled ----
// (presumably the final else branch - the keyword is not visible in this listing)
4437 for (x = startx;x < endx;x++)
4440 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4441 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4442 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4443 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4444 if (attenuation < 0.01f)
4446 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4448 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4449 if (attenuation < 0.01f)
4453 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4454 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4455 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4456 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4457 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4459 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4460 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4461 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4462 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4464 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4466 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4467 attenuation *= (1.0f / 255.0f);
4468 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4469 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4470 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4471 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4475 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4476 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4477 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4478 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4480 buffer_FragColorbgra8[x*4+0] = d[0];
4481 buffer_FragColorbgra8[x*4+1] = d[1];
4482 buffer_FragColorbgra8[x*4+2] = d[2];
4483 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span over for the final write
4486 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4492 void DPSOFTRAST_VertexShader_Refraction(void)
4494 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4495 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4496 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4499 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4501 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4503 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4505 int x, startx = span->startx, endx = span->endx;
4508 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4509 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4512 float ModelViewProjectionPositiondata[4];
4513 float ModelViewProjectionPositionslope[4];
4516 float ScreenScaleRefractReflect[2];
4517 float ScreenCenterRefractReflect[2];
4518 float DistortScaleRefractReflect[2];
4519 float RefractColor[4];
4521 const unsigned char * RESTRICT pixelbase;
4522 const unsigned char * RESTRICT pixel[4];
4523 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4524 if(!texture) return;
4525 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4528 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4529 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4532 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4535 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4536 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4537 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4538 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4539 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4540 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4541 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4542 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4543 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4544 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4547 for (x = startx;x < endx;x++)
4549 float SafeScreenTexCoord[2];
4550 float ScreenTexCoord[2];
4557 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4558 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4560 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4561 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4562 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4564 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4565 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4566 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4567 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4568 DPSOFTRAST_Vector3Normalize(v);
4569 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4570 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4572 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4573 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4575 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4576 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4577 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4578 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4579 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4580 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4581 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4582 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4583 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4584 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4585 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4586 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4587 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4588 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4589 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4590 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4591 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4595 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4596 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4597 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4598 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4604 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4605 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4606 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4607 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4608 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4611 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4616 void DPSOFTRAST_VertexShader_Water(void)
4618 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4622 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4625 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4626 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4627 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4628 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4629 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4634 void DPSOFTRAST_VertexShader_ShowDepth(void)
4636 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4639 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4642 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4643 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4644 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4645 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4646 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4651 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4653 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4656 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4659 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4660 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4661 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4662 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4663 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4668 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4670 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4673 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4676 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4677 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4678 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4679 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4680 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex-stage and span (pixel-stage) entry
// points plus the vertex arrays and texture units the mode lists for itself.
// NOTE(review): this listing does not show any member ahead of Vertex, yet
// the entries of DPSOFTRAST_ShaderModeTable start with an integer (2) before
// the vertex function - confirm the leading member against the full file.
4685 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage entry point; takes no arguments
4688 void (*Vertex)(void);
// pixel-stage entry point, called with the thread state, the triangle and one span
4689 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices listed for the mode; the table initializers end each list with ~0
4690 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* texture unit indices listed for the mode; the table initializers end each list with ~0
4691 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4693 DPSOFTRAST_ShaderModeInfo;
// Dispatch table indexed by shader mode (it is subscripted with
// thread->shader_mode and dpsoftrast.shader_mode elsewhere in this file).
// Each row supplies: a leading integer, the vertex callback, the span callback,
// the list of vertex arrays and the list of texture units. Lists are ended with
// ~0, which becomes 255 once stored in the unsigned char members.
// NOTE(review): the leading 2 presumably initializes the lodarrayindex member
// that DPSOFTRAST_Interpret_Draw reads - confirm against the struct declaration.
// NOTE(review): the last four rows give no texture-unit initializer, so that
// list is zero-filled (sixteen references to unit 0) rather than ended by ~0;
// confirm this is intended.
4695 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4697 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4698 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4699 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4700 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4701 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4702 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4703 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4704 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4705 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4706 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4707 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4708 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4709 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4710 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4711 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4712 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Runs every span queued on `thread` through the depth comparison and the
// current shader mode's Span callback, then empties the span queue.
// thread: per-thread state providing the spans, the triangles they refer to,
//         and the depth/color settings consulted below
4715 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4722 // unsigned int *colorpixel;
4723 unsigned int *depthpixel;
4729 DPSOFTRAST_State_Triangle *triangle;
4730 DPSOFTRAST_State_Span *span;
4731 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
4732 for (i = 0; i < thread->numspans; i++)
4734 span = &thread->spans[i];
4735 triangle = &thread->triangles[span->triangle];
// depth-tested path: requires both the depthtest flag and a depth buffer
4736 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// w is evaluated from the triangle's plane (w[0] = x slope, w[1] = y slope,
// w[2] = origin) at the span position and scaled into integer depth units
4738 wslope = triangle->w[0];
4739 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4740 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4741 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4742 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4743 startx = span->startx;
// store the per-pixel outcome of the selected depth comparison in pixelmask
4745 switch(thread->fb_depthfunc)
4748 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4749 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4750 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4751 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4752 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4753 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4754 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4756 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4757 //for (x = startx;x < endx;x++)
4758 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4759 // if there is no color buffer, skip pixel shader
// trim masked-out pixels from the front and the back of the span
4760 while (startx < endx && !pixelmask[startx])
4762 while (endx > startx && !pixelmask[endx-1])
4765 continue; // no pixels to fill
4766 span->pixelmask = pixelmask;
4767 span->startx = startx;
4769 // run pixel shader if appropriate
4770 // do this before running depthmask code, to allow the pixelshader
4771 // to clear pixelmask values for alpha testing
4772 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4773 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// second walk over the same pixel range, taken only when depthmask is set
4774 if (thread->depthmask)
4775 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4781 // no depth testing means we're just dealing with color...
4782 // if there is no color buffer, skip pixel shader
4783 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// without a depth test every pixel of the span is flagged in the mask
4785 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4786 span->pixelmask = pixelmask;
4787 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans have been handled; restart the queue
4791 thread->numspans = 0;
// Field list of the Draw command; the functions below read and write these
// members through DPSOFTRAST_Command_Draw.
// NOTE(review): DEFCOMMAND is defined elsewhere; presumably it declares the
// DPSOFTRAST_Command_Draw struct and ties it to opcode 22 - confirm.
4794 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4795
// Rasterizes one Draw command on behalf of `thread`.
// thread:  per-thread state (scissor, the two row bands miny1..maxy1 and
//          miny2..maxy2, and the span/triangle queues filled here)
// command: draw command holding the vertex arrays, the index data and a
//          refcount shared between threads
// The function decrements command->refcount once on each visible exit path.
// When the count reaches zero and the command is no larger than the aligned
// header (i.e. its data was allocated separately), command->arrays is released
// with MM_FREE.
4796 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4799 int cullface = thread->cullface;
4800 int minx, maxx, miny, maxy;
4801 int miny1, maxy1, miny2, maxy2;
4802 __m128i fbmin, fbmax;
4803 __m128 viewportcenter, viewportscale;
4804 int firstvertex = command->firstvertex;
4805 int numvertices = command->numvertices;
4806 int numtriangles = command->numtriangles;
4807 const int *element3i = command->element3i;
4808 const unsigned short *element3s = command->element3s;
4809 int clipped = command->clipped;
4816 int starty, endy, bandy;
4820 float clip0origin, clip0slope;
4822 __m128 triangleedge1, triangleedge2, trianglenormal;
4825 DPSOFTRAST_State_Triangle *triangle;
4826 DPSOFTRAST_Texture *texture;
4827 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two row bands to the vertical extent of the scissor
4828 miny = thread->fb_scissor[1];
4829 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4830 miny1 = bound(miny, thread->miny1, maxy);
4831 maxy1 = bound(miny, thread->maxy1, maxy);
4832 miny2 = bound(miny, thread->miny2, maxy);
4833 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range overlaps neither band: only release this thread's
// reference to the command
4834 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4836 if (!ATOMIC_DECREMENT(command->refcount))
4838 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4839 MM_FREE(command->arrays);
4843 minx = thread->fb_scissor[0];
4844 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp limits packed as repeated 16-bit (x, y) pairs, used on the bounding box below
4845 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4846 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4847 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4848 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4849 screen[3] = _mm_setzero_ps();
4850 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
// per triangle: fetch indices, clip against the near plane, bound to the
// framebuffer, set up interpolation planes, then emit spans
4851 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen coordinates; the remaining arrays follow
4853 const float *screencoord4f = command->arrays;
4854 const float *arrays = screencoord4f + numvertices*4;
4856 // generate the 3 edges of this triangle
4857 // generate spans for the triangle - switch based on left split or right split classification of triangle
// indices read from the 16-bit element array, rebased to firstvertex
4860 e[0] = element3s[i*3+0] - firstvertex;
4861 e[1] = element3s[i*3+1] - firstvertex;
4862 e[2] = element3s[i*3+2] - firstvertex;
// indices read from the 32-bit element array, rebased to firstvertex
4866 e[0] = element3i[i*3+0] - firstvertex;
4867 e[1] = element3i[i*3+1] - firstvertex;
4868 e[2] = element3i[i*3+2] - firstvertex;
// SKIPBACKFACE: forms the two edges that meet at screen[1] and the scalar
// cross-product term e1.x*e2.y - e1.y*e2.x, whose sign is then compared with zero
4877 #define SKIPBACKFACE \
4878 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4879 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4880 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4881 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4882 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4886 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4890 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4895 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4896 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4898 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4899 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4901 #define CLIPPEDVERTEXCOPY(k,p1) \
4902 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// attribute helpers: copy a vertex's value unchanged, or interpolate between two
// vertices with the clipfrac computed by CLIPPEDVERTEXLERP for that edge
4904 #define GENATTRIBCOPY(attrib, p1) \
4905 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4906 #define GENATTRIBLERP(attrib, p1, p2) \
4908 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4909 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4911 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4915 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4916 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4917 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4918 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4919 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4920 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4921 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4927 // calculate distance from nearplane
4928 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4929 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4930 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// each combination of vertices with non-negative distance yields either three
// or four screen points, built from copies and edge interpolations
4931 if (clipdist[0] >= 0.0f)
4933 if (clipdist[1] >= 0.0f)
4935 if (clipdist[2] >= 0.0f)
4938 // triangle is entirely in front of nearplane
4939 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4946 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4954 if (clipdist[2] >= 0.0f)
4956 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4963 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4970 else if (clipdist[1] >= 0.0f)
4972 if (clipdist[2] >= 0.0f)
4974 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4981 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4987 else if (clipdist[2] >= 0.0f)
4989 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4994 else continue; // triangle is entirely behind nearplane
4997 // calculate integer y coords for triangle points
// the x/y of all points are truncated and packed to 16 bits, reduced to a
// min/max pair, and clamped to fbmin/fbmax
4998 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4999 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5000 screenmin = _mm_min_epi16(screeni, screenir),
5001 screenmax = _mm_max_epi16(screeni, screenir);
5002 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5003 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5004 screenmin = _mm_max_epi16(screenmin, fbmin);
5005 screenmax = _mm_min_epi16(screenmax, fbmax);
5006 // skip offscreen triangles
5007 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5009 starty = _mm_extract_epi16(screenmin, 1);
5010 endy = _mm_extract_epi16(screenmax, 1)+1;
// rows that fall entirely in the gap between the first and second band
5011 if (starty >= maxy1 && endy <= miny2)
5013 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in this thread's triangle queue
5016 triangle = &thread->triangles[thread->numtriangles];
5018 // calculate attribute plans for triangle data...
5019 // okay, this triangle is going to produce spans, we'd better project
5020 // the interpolants now (this is what gives perspective texturing),
5021 // this consists of simply multiplying all arrays by the W coord
5022 // (which is basically 1/Z), which will be undone per-pixel
5023 // (multiplying by Z again) to get the perspective-correct array
5026 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5027 __m128 mipedgescale, mipdensity;
// edge components divided by the cross-product term give the factors that turn
// per-edge deltas into per-pixel x and y slopes
5028 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5029 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5030 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5031 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5032 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5033 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5034 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5035 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5036 attribedge1 = _mm_sub_ss(w0, w1);
5037 attribedge2 = _mm_sub_ss(w2, w1);
5038 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5039 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5040 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5041 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5042 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// w plane of the triangle: [0] = x slope, [1] = y slope, [2] = value at the origin
5043 _mm_store_ss(&triangle->w[0], attribxslope);
5044 _mm_store_ss(&triangle->w[1], attribyslope);
5045 _mm_store_ss(&triangle->w[2], attriborigin);
// clip plane enabled (any of the first three coefficients non-zero): express the
// plane as a linear function of screen x/y using component 2 of the screen points
5050 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5052 float cliporigin, clipxslope, clipyslope;
5053 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5054 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5055 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5056 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5057 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5058 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5059 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5060 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5061 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5064 clip0origin = -cliporigin/clipxslope;
5065 clip0slope = -clipyslope/clipxslope;
5066 clip0dir = clipxslope > 0 ? 1 : -1;
5068 else if(clipyslope > 0)
5070 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5071 clip0slope = dpsoftrast.fb_width;
5074 else if(clipyslope < 0)
5076 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5077 clip0slope = -dpsoftrast.fb_width;
5080 else if(clip0origin < 0) continue;
5083 mipedgescale = _mm_setzero_ps();
// for each array listed by the shader mode, store the x-slope, y-slope and
// origin of the w-multiplied attribute in triangle->attribs[k]
5084 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5086 __m128 attrib0, attrib1, attrib2;
5087 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5088 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5090 arrays += numvertices*4;
5091 GENATTRIBS(attrib0, attrib1, attrib2);
5092 attriborigin = _mm_mul_ps(attrib1, w1);
5093 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5094 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5095 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5096 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5097 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5098 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5099 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5100 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array selected by lodarrayindex also supplies the attribute-change-per-
// edge-length scale used for mip selection below
5101 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5103 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5104 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5105 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5106 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip levels default to 0; units bound to textures whose filter is above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR get a level computed from mipedgescale
5110 memset(triangle->mip, 0, sizeof(triangle->mip));
5111 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5113 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5114 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5116 texture = thread->texbound[texunit];
5117 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5119 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5120 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5121 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5122 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5123 // this will be multiplied in the texturing routine by the texture resolution
5124 y = _mm_cvtss_si32(mipdensity);
// half of log2 of the squared density, capped at the last available mip level
5127 y = (int)(log((float)y)*0.5f/M_LN2);
5128 if (y > texture->mipmaps - 1)
5129 y = texture->mipmaps - 1;
5130 triangle->mip[texunit] = y;
// rows are walked band by band: first limited by maxy1, afterwards starting no
// earlier than miny2 and limited by maxy2
5136 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5139 __m128 xcoords, xslope;
// compare y with every point's integer y; the resulting byte mask selects the
// pair of edges (edge0p->edge0n, edge1p->edge1n) active on this row
5140 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5141 int yccmask = _mm_movemask_epi8(ycc);
5142 int edge0p, edge0n, edge1p, edge1n;
// edge selection when the polygon has four points (indices 0..3)
5150 case 0xFFFF: /*0000*/ y = endy; continue;
5151 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5152 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5153 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5154 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5155 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5156 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5157 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5158 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5159 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5160 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5161 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5162 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5163 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5164 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5165 case 0x0000: /*1111*/ y++; continue;
// edge selection when the polygon has three points (indices 0..2)
5173 case 0xFFFF: /*000*/ y = endy; continue;
5174 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5175 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5176 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5177 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5178 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5179 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5180 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can reuse this edge pair, limited to the current band
5183 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5184 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5185 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5186 nexty = _mm_extract_epi16(ycc, 0);
5187 if (nexty >= bandy) nexty = bandy-1;
// x change per row along both edges, and the two edge x positions at row y
5188 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5189 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5190 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5191 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5192 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// swap the two halves when the first edge's x is greater than the second's
5193 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5195 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5196 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5198 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
// emit spans for rows y..nexty, stepping the edge positions and the clip
// boundary by their per-row slopes
5199 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5201 int startx, endx, clipx = minx, offset;
5202 startx = _mm_cvtss_si32(xcoords);
5203 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5206 if (startx < 0) startx = 0;
5207 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5209 if (endx > maxx) endx = maxx;
5210 if (startx >= endx) continue;
5218 if(endx <= clip0) continue;
5219 clipx = max((int)clip0, minx);
5220 startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5223 else if (endx > clip0)
5225 if(startx >= clip0) continue;
// split the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5230 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5232 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5233 span->triangle = thread->numtriangles;
5236 span->startx = max(clipx - offset, 0);
5237 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5238 if (span->startx >= span->endx)
// span queue full: process what has been queued so far
5240 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5241 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush the pending spans and restart the triangle queue
5246 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5248 DPSOFTRAST_Draw_ProcessSpans(thread);
5249 thread->numtriangles = 0;
// this thread is finished with the command: drop its reference and release
// separately allocated arrays when the count reaches zero
5253 if (!ATOMIC_DECREMENT(command->refcount))
5255 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5256 MM_FREE(command->arrays);
// flush whatever is still queued before returning
5259 if (thread->numspans > 0 || thread->numtriangles > 0)
5261 DPSOFTRAST_Draw_ProcessSpans(thread);
5262 thread->numtriangles = 0;
// Reserves a Draw command together with storage for the per-vertex output
// arrays and a private copy of the index data.
// firstvertex, numvertices: vertex range, recorded in the command and in dpsoftrast
// numtriangles:             triangle count; sizes the index copy
// element3i, element3s:     32-bit and 16-bit index arrays that the copies below read from
// Side effects: clears and refills dpsoftrast.post_array4f, and sets
// dpsoftrast.screencoord4f, firstvertex and numvertices.
// DPSOFTRAST_DrawTriangles uses the result as the command to complete.
5267 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5271 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] per vertex up front: screen coordinates and the position array
5272 int datasize = 2*numvertices*sizeof(float[4]);
5273 DPSOFTRAST_Command_Draw *command;
5274 unsigned char *data;
// plus one float[4] per vertex for every array listed by the current shader mode
5275 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5277 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5278 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5280 datasize += numvertices*sizeof(float[4]);
// plus room for the index copy (16-bit or 32-bit triples)
5283 datasize += numtriangles*sizeof(unsigned short[3]);
5285 datasize += numtriangles*sizeof(int[3]);
5286 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to embed: the command holds only the header and the data lives in a
// separate zero-filled allocation
5287 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5289 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5290 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data sits directly behind the command header
5294 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5295 data = (unsigned char *)command + commandsize;
5297 command->firstvertex = firstvertex;
5298 command->numvertices = numvertices;
5299 command->numtriangles = numtriangles;
5300 command->arrays = (float *)data;
5301 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5302 dpsoftrast.firstvertex = firstvertex;
5303 dpsoftrast.numvertices = numvertices;
// carve up the data block: screen coordinates, then position, then each listed array
5304 dpsoftrast.screencoord4f = (float *)data;
5305 data += numvertices*sizeof(float[4]);
5306 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5307 data += numvertices*sizeof(float[4]);
5308 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5310 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5311 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5313 dpsoftrast.post_array4f[j] = (float *)data;
5314 data += numvertices*sizeof(float[4]);
// the index copy is placed after the vertex arrays
5316 command->element3i = NULL;
5317 command->element3s = NULL;
5320 command->element3s = (unsigned short *)data;
5321 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5325 command->element3i = (int *)data;
5326 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the current shader
// mode's vertex callback, and hands the command to the rasterizing threads.
// firstvertex, numvertices: vertex range to draw
// numtriangles:             number of index triples
// element3i, element3s:     32-bit / 16-bit index arrays, forwarded to the allocator
5331 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5333 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5334 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the row range reported in dpsoftrast.drawstarty/drawendy to the framebuffer height
5335 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5336 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated arrays and take the command back
5337 if (command->starty >= command->endy)
5339 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5340 MM_FREE(command->arrays);
5341 DPSOFTRAST_UndoCommand(command->commandsize);
5344 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; DPSOFTRAST_Interpret_Draw decrements it
5345 command->refcount = dpsoftrast.numthreads;
5347 if (dpsoftrast.usethreads)
// publish the command, then wake starving threads whose row bands overlap it
5350 DPSOFTRAST_Draw_SyncCommands();
5351 for (i = 0; i < dpsoftrast.numthreads; i++)
5353 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5354 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5355 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the non-threaded branch - confirm
5360 DPSOFTRAST_Draw_FlushThreads();
// Field list of the SetRenderTargets command (width and height).
// NOTE(review): DEFCOMMAND is defined elsewhere; presumably it declares
// DPSOFTRAST_Command_SetRenderTargets for opcode 23 - confirm.
5364 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler for SetRenderTargets: flags the thread's framebuffer
// state for revalidation. The command's fields are not read by the visible code.
5365 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5367 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point that selects the framebuffer to render into.
// width, height:  framebuffer dimensions
// depthpixels:    depth buffer pointer
// colorpixels0-3: the four color buffer pointers
// The update is guarded by a comparison against the values currently stored in dpsoftrast.
5369 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5371 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any dimension or buffer pointer differs from the current state
5372 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5373 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5374 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5376 dpsoftrast.fb_width = width;
5377 dpsoftrast.fb_height = height;
5378 dpsoftrast.fb_depthpixels = depthpixels;
5379 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5380 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5381 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5382 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// viewport center/scale are recalculated after the framebuffer fields change
5383 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// queue a command carrying the new dimensions for the rasterizing threads
5384 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5385 command->width = width;
5386 command->height = height;
// Executes queued commands for `thread`, starting at thread->commandoffset and
// stopping when the offset equals endoffset.
// thread:    state the individual command handlers operate on
// endoffset: offset into dpsoftrast.commandpool.commands at which to stop
// The offset is tested against DPSOFTRAST_DRAW_MAXCOMMANDPOOL after each command
// so that it can wrap back to the start of the pool.
5389 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5391 int commandoffset = thread->commandoffset;
5392 while (commandoffset != endoffset)
5394 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5395 switch (command->opcode)
// fixed-size commands: call the matching DPSOFTRAST_Interpret_<name> handler and
// advance by the aligned size of that command's struct
5397 #define INTERPCOMMAND(name) \
5398 case DPSOFTRAST_OPCODE_##name : \
5399 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5400 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5401 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5402 commandoffset = 0; \
5404 INTERPCOMMAND(Viewport)
5405 INTERPCOMMAND(ClearColor)
5406 INTERPCOMMAND(ClearDepth)
5407 INTERPCOMMAND(ColorMask)
5408 INTERPCOMMAND(DepthTest)
5409 INTERPCOMMAND(ScissorTest)
5410 INTERPCOMMAND(Scissor)
5411 INTERPCOMMAND(BlendFunc)
5412 INTERPCOMMAND(BlendSubtract)
5413 INTERPCOMMAND(DepthMask)
5414 INTERPCOMMAND(DepthFunc)
5415 INTERPCOMMAND(DepthRange)
5416 INTERPCOMMAND(PolygonOffset)
5417 INTERPCOMMAND(CullFace)
5418 INTERPCOMMAND(AlphaTest)
5419 INTERPCOMMAND(AlphaFunc)
5420 INTERPCOMMAND(SetTexture)
5421 INTERPCOMMAND(SetShader)
5422 INTERPCOMMAND(Uniform4f)
5423 INTERPCOMMAND(UniformMatrix4f)
5424 INTERPCOMMAND(Uniform1i)
5425 INTERPCOMMAND(SetRenderTargets)
5426 INTERPCOMMAND(ClipPlane)
// Draw commands may carry an inline payload, so the advance uses the size stored
// in the command, and the thread's offset is written back right after the draw
5428 case DPSOFTRAST_OPCODE_Draw:
5429 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5430 commandoffset += command->commandsize;
5431 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5433 thread->commandoffset = commandoffset;
5436 case DPSOFTRAST_OPCODE_Reset:
// record how far this thread has progressed
5441 thread->commandoffset = commandoffset;
// Worker thread entry point.
// data: pointer to the DPSOFTRAST_State_Thread this worker services
// Loops while thread->index is non-negative: interprets pending commands up to
// dpsoftrast.drawcommand and, once caught up, sleeps on drawcond.
5444 static int DPSOFTRAST_Draw_Thread(void *data)
5446 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5447 while(thread->index >= 0)
// commands are pending for this thread
5449 if (thread->commandoffset != dpsoftrast.drawcommand)
5451 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5455 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex before sleeping
5456 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// wake a flusher blocked in DPSOFTRAST_Draw_FlushThreads, then mark this thread
// as starving for the duration of the wait so producers know to signal drawcond
5458 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5459 thread->starving = true;
5460 Thread_CondWait(thread->drawcond, thread->drawmutex);
5461 thread->starving = false;
5463 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread's command offset up to dpsoftrast.drawcommand and then
// marks the command pool as unused.
// Threaded mode: wake threads that are behind and starving, then wait on each
// lagging thread's waitcond. The final loop interprets outstanding commands
// directly on the calling thread.
5469 static void DPSOFTRAST_Draw_FlushThreads(void)
5471 DPSOFTRAST_State_Thread *thread;
5473 DPSOFTRAST_Draw_SyncCommands();
5474 if (dpsoftrast.usethreads)
// pass 1: signal every thread that is behind and currently starving
5476 for (i = 0; i < dpsoftrast.numthreads; i++)
5478 thread = &dpsoftrast.threads[i];
5479 if (thread->commandoffset != dpsoftrast.drawcommand)
5481 Thread_LockMutex(thread->drawmutex);
5482 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5483 Thread_CondSignal(thread->drawcond);
5484 Thread_UnlockMutex(thread->drawmutex);
// pass 2: block on waitcond for every thread that is still behind; the worker
// signals waitcond when it sees the waiting flag
5487 for (i = 0; i < dpsoftrast.numthreads; i++)
5489 thread = &dpsoftrast.threads[i];
5490 if (thread->commandoffset != dpsoftrast.drawcommand)
5492 Thread_LockMutex(thread->drawmutex);
5493 if (thread->commandoffset != dpsoftrast.drawcommand)
5495 thread->waiting = true;
5496 Thread_CondWait(thread->waitcond, thread->drawmutex);
5497 thread->waiting = false;
5499 Thread_UnlockMutex(thread->drawmutex);
// NOTE(review): presumably the non-threaded branch - the caller runs each
// thread state's pending commands itself; confirm
5505 for (i = 0; i < dpsoftrast.numthreads; i++)
5507 thread = &dpsoftrast.threads[i];
5508 if (thread->commandoffset != dpsoftrast.drawcommand)
5509 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// everything has been consumed, so the pool's usage counter is reset
5512 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point; forwards to DPSOFTRAST_Draw_FlushThreads.
5515 void DPSOFTRAST_Flush(void)
5517 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point; takes no arguments and returns nothing.
5520 void DPSOFTRAST_Finish(void)
5525 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5535 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5536 dpsoftrast.bigendian = u.b[3];
5537 dpsoftrast.fb_width = width;
5538 dpsoftrast.fb_height = height;
5539 dpsoftrast.fb_depthpixels = depthpixels;
5540 dpsoftrast.fb_colorpixels[0] = colorpixels;
5541 dpsoftrast.fb_colorpixels[1] = NULL;
5542 dpsoftrast.fb_colorpixels[1] = NULL;
5543 dpsoftrast.fb_colorpixels[1] = NULL;
5544 dpsoftrast.viewport[0] = 0;
5545 dpsoftrast.viewport[1] = 0;
5546 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5547 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5548 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5549 dpsoftrast.texture_firstfree = 1;
5550 dpsoftrast.texture_end = 1;
5551 dpsoftrast.texture_max = 0;
5552 dpsoftrast.color[0] = 1;
5553 dpsoftrast.color[1] = 1;
5554 dpsoftrast.color[2] = 1;
5555 dpsoftrast.color[3] = 1;
5556 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5557 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5558 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5559 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5560 for (i = 0; i < dpsoftrast.numthreads; i++)
5562 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5564 thread->cullface = GL_BACK;
5565 thread->colormask[1] = 1;
5566 thread->colormask[2] = 1;
5567 thread->colormask[3] = 1;
5568 thread->blendfunc[0] = GL_ONE;
5569 thread->blendfunc[1] = GL_ZERO;
5570 thread->depthmask = true;
5571 thread->depthtest = true;
5572 thread->depthfunc = GL_LEQUAL;
5573 thread->scissortest = false;
5574 thread->alphatest = false;
5575 thread->alphafunc = GL_GREATER;
5576 thread->alphavalue = 0.5f;
5577 thread->viewport[0] = 0;
5578 thread->viewport[1] = 0;
5579 thread->viewport[2] = dpsoftrast.fb_width;
5580 thread->viewport[3] = dpsoftrast.fb_height;
5581 thread->scissor[0] = 0;
5582 thread->scissor[1] = 0;
5583 thread->scissor[2] = dpsoftrast.fb_width;
5584 thread->scissor[3] = dpsoftrast.fb_height;
5585 thread->depthrange[0] = 0;
5586 thread->depthrange[1] = 1;
5587 thread->polygonoffset[0] = 0;
5588 thread->polygonoffset[1] = 0;
5589 thread->clipplane[0] = 0;
5590 thread->clipplane[1] = 0;
5591 thread->clipplane[2] = 0;
5592 thread->clipplane[3] = 1;
5594 thread->numspans = 0;
5595 thread->numtriangles = 0;
5596 thread->commandoffset = 0;
5597 thread->waiting = false;
5598 thread->starving = false;
5600 thread->validate = -1;
5601 DPSOFTRAST_Validate(thread, -1);
5603 if (dpsoftrast.usethreads)
5605 thread->waitcond = Thread_CreateCond();
5606 thread->drawcond = Thread_CreateCond();
5607 thread->drawmutex = Thread_CreateMutex();
5608 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5614 void DPSOFTRAST_Shutdown(void)
5617 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5619 DPSOFTRAST_State_Thread *thread;
5620 for (i = 0; i < dpsoftrast.numthreads; i++)
5622 thread = &dpsoftrast.threads[i];
5623 Thread_LockMutex(thread->drawmutex);
5625 Thread_CondSignal(thread->drawcond);
5626 Thread_UnlockMutex(thread->drawmutex);
5627 Thread_WaitThread(thread->thread, 0);
5628 Thread_DestroyCond(thread->waitcond);
5629 Thread_DestroyCond(thread->drawcond);
5630 Thread_DestroyMutex(thread->drawmutex);
5633 for (i = 0;i < dpsoftrast.texture_end;i++)
5634 if (dpsoftrast.texture[i].bytes)
5635 MM_FREE(dpsoftrast.texture[i].bytes);
5636 if (dpsoftrast.texture)
5637 free(dpsoftrast.texture);
5638 if (dpsoftrast.threads)
5639 MM_FREE(dpsoftrast.threads);
5640 memset(&dpsoftrast, 0, sizeof(dpsoftrast));