3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
14 #define ATOMIC_SIZE 32
// Compiler-specific alignment and atomic-counter primitives.
// ALIGN requests 16-byte alignment, ATOMIC requests 32-byte alignment.
// NOTE(review): no #else/#endif lines of this conditional chain are visible in
// this listing; they are presumably among the lines not shown.
// Apple toolchains: OSAtomic barrier variants from libkern.
17 #if defined(__APPLE__)
18 #include <libkern/OSAtomic.h>
19 #define ALIGN(var) var __attribute__((__aligned__(16)))
20 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21 #define MEMORY_BARRIER (_mm_sfence())
22 #define ATOMIC_COUNTER volatile int32_t
23 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: __sync builtins.
26 #elif defined(__GNUC__)
27 #define ALIGN(var) var __attribute__((__aligned__(16)))
28 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: Interlocked* functions on a LONG counter.
35 #elif defined(_MSC_VER)
36 #define ALIGN(var) __declspec(align(16)) var
37 #define ATOMIC(var) __declspec(align(32)) var
38 #define MEMORY_BARRIER (_mm_sfence())
40 #define ATOMIC_COUNTER volatile LONG
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks that add no alignment attribute.
48 #define ALIGN(var) var
51 #define ATOMIC(var) var
// Fallbacks for anything still undefined: plain, non-atomic operations.
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
70 #include <emmintrin.h>
72 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
74 static void *MM_CALLOC(size_t nmemb, size_t size)
76 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
77 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Releases memory obtained through the _mm_malloc-based MM_MALLOC/MM_CALLOC.
81 #define MM_FREE _mm_free
// Variants built on the C allocator.
// NOTE(review): the preprocessor condition selecting them is not visible in this listing.
83 #define MM_MALLOC(size) malloc(size)
84 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays. DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and sizes attribs[] and post_array4f[] in this file.
88 typedef enum DPSOFTRAST_ARRAY_e
90 DPSOFTRAST_ARRAY_POSITION,
91 DPSOFTRAST_ARRAY_COLOR,
92 DPSOFTRAST_ARRAY_TEXCOORD0,
93 DPSOFTRAST_ARRAY_TEXCOORD1,
94 DPSOFTRAST_ARRAY_TEXCOORD2,
95 DPSOFTRAST_ARRAY_TEXCOORD3,
96 DPSOFTRAST_ARRAY_TEXCOORD4,
97 DPSOFTRAST_ARRAY_TEXCOORD5,
98 DPSOFTRAST_ARRAY_TEXCOORD6,
99 DPSOFTRAST_ARRAY_TEXCOORD7,
100 DPSOFTRAST_ARRAY_TOTAL
// One entry of the dpsoftrast.texture table.
// NOTE(review): further fields (flags, width, height, depth, sides, mipmaps, size
// are assigned in DPSOFTRAST_Texture_New) are not visible in this listing.
104 typedef struct DPSOFTRAST_Texture_s
// filter mode stored by DPSOFTRAST_Texture_Filter
111 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts queued uses of this texture — confirm
114 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks an unused slot
115 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
116 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command sizes are rounded up to COMMAND_SIZE and command structs use ALIGN alignment.
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
// Header that every command stored in the command pool starts with.
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
// DPSOFTRAST_OPCODE_* value
125 unsigned char opcode;
// size argument recorded by DPSOFTRAST_AllocateCommand
126 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand where the pool wraps around.
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// Declares the opcode constant DPSOFTRAST_OPCODE_<name> and the command struct
// DPSOFTRAST_Command_<name>, which begins with the same opcode/commandsize
// header as DPSOFTRAST_Command.
// NOTE(review): the line that expands `fields` into the struct body is not
// visible in this listing.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
136 unsigned char opcode; \
137 unsigned short commandsize; \
139 } DPSOFTRAST_Command_##name );
// Byte size of the command ring buffer.
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command's size — confirm against users
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Ring buffer of queued commands. The freecommand and usedcommands members
// used by the allocator are among the fields not visible in this listing.
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
148 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
150 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the DPSOFTRAST_CALCATTRIB* macros.
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
154 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array: row [0] is multiplied by span x, row [1] by span y, row [2] is the base value
156 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
158 DPSOFTRAST_State_Triangle);
// SSE attribute setup for a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data are assigned __m128 values; the attribs rows are read with
// aligned loads (_mm_load_ps).
160 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
161 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
162 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
163 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
164 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB operating on float[4] arrays:
//   slope[c] = attribs[arrayindex][0][c]
//   data[c]  = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c]
// Every pointer/array argument is parenthesized so expressions such as &span
// or base + n expand correctly. Arguments are evaluated several times, so
// pass expressions without side effects.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced while rasterizing a triangle.
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
181 int triangle; // triangle this span was generated by
182 int x; // framebuffer x coord
183 int y; // framebuffer y coord
184 int startx; // usable range (according to pixelmask)
185 int endx; // usable range (according to pixelmask)
186 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
188 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bits of thread->validate: each marks one group of derived values as stale
// until DPSOFTRAST_Validate recomputes it.
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three groups together.
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc derives from the GL blend factors.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,  // GL_ONE, GL_ZERO; also used for unrecognized factor pairs
	DPSOFTRAST_BLENDMODE_ALPHA,       // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
	DPSOFTRAST_BLENDMODE_ADDALPHA,    // GL_SRC_ALPHA, GL_ONE
	DPSOFTRAST_BLENDMODE_ADD,         // GL_ONE, GL_ONE
	DPSOFTRAST_BLENDMODE_INVMOD,      // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
	DPSOFTRAST_BLENDMODE_MUL,         // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
	DPSOFTRAST_BLENDMODE_MUL2,        // GL_DST_COLOR, GL_SRC_COLOR
	DPSOFTRAST_BLENDMODE_SUBALPHA,    // GL_SRC_ALPHA, GL_ONE with blend subtract enabled
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
	DPSOFTRAST_BLENDMODE_INVADD,      // GL_ONE_MINUS_DST_COLOR, GL_ONE
	DPSOFTRAST_BLENDMODE_TOTAL        // number of blend modes
}
DPSOFTRAST_BLENDMODE;
// Per-thread copy of the rendering state, updated by the DPSOFTRAST_Interpret_*
// handlers, plus that thread's span/triangle scratch arrays.
// NOTE(review): many members used elsewhere in this file (viewport, scissor,
// validate, fb_scissor, drawmutex, ...) are not visible in this listing.
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
233 float polygonoffset[2];
236 int shader_permutation;
238 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
240 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
241 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
243 // DPSOFTRAST_VALIDATE_ flags
246 // derived values (DPSOFTRAST_VALIDATE_FB)
249 ALIGN(float fb_viewportcenter[4]);
250 ALIGN(float fb_viewportscale[4]);
252 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
255 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; compared against dpsoftrast.drawcommand and freecommand
264 ATOMIC(volatile int commandoffset);
// set by DPSOFTRAST_Draw_FreeCommandPool around its wait on waitcond
266 volatile bool waiting;
// when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before waiting
267 volatile bool starving;
274 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
275 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
277 DPSOFTRAST_State_Thread);
// Global rasterizer state: render targets, vertex array pointers, the texture
// table, the thread array and the command pool.
// NOTE(review): several members used elsewhere in this file (fb_width,
// fb_height, viewport, texture_max, texture_end, numthreads, usethreads, ...)
// are not visible in this listing.
279 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets installed by DPSOFTRAST_SetRenderTargets
283 unsigned int *fb_depthpixels;
284 unsigned int *fb_colorpixels[4];
// recomputed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
287 ALIGN(float fb_viewportcenter[4]);
288 ALIGN(float fb_viewportscale[4]);
291 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
292 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
294 const float *pointer_vertex3f;
295 const float *pointer_color4f;
296 const unsigned char *pointer_color4ub;
297 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
300 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
301 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the texture array; rebased by DPSOFTRAST_Texture_Grow
302 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
306 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
307 float *screencoord4f;
313 int shader_permutation;
// index where DPSOFTRAST_Texture_New starts searching for an unused slot
317 int texture_firstfree;
// texture table, reallocated by DPSOFTRAST_Texture_Grow
318 DPSOFTRAST_Texture *texture;
// message describing the most recent failure, set by the texture functions
323 const char *errorstring;
328 DPSOFTRAST_State_Thread *threads;
// set to commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
330 ATOMIC(volatile int drawcommand);
332 DPSOFTRAST_State_Command_Pool commandpool;
// the single global rasterizer instance
336 DPSOFTRAST_State dpsoftrast;
// Scale (2^30) and offset applied when converting depth values to integers.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into a BGRA8 word: a in bits
// 24-31, r in 16-23, g in 8-15, b in 0-7, each rounded to the nearest of 256
// levels. Arguments are parenthesized so compound expressions convert
// correctly, and the shifts are done on unsigned values so a == 1.0f no longer
// shifts into the sign bit of a signed int. The result type remains int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	 (unsigned int)(int)((b) * 255.0f + 0.5f) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth format: DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
344 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
346 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
347 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
348 fb_viewportcenter[3] = 0.5f;
349 fb_viewportcenter[0] = 0.0f;
350 fb_viewportscale[1] = 0.5f * viewport[2];
351 fb_viewportscale[2] = -0.5f * viewport[3];
352 fb_viewportscale[3] = 0.5f;
353 fb_viewportscale[0] = 1.0f;
// Recomputes thread->fb_scissor (x, y, width, height in framebuffer
// coordinates, y flipped against dpsoftrast.fb_height) and the thread's
// viewport transform.
// NOTE(review): the local declarations and some clamp statements are not
// visible in this listing (only the upper-bound clamps are shown).
356 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
358 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
359 // and viewport projection values
362 x1 = thread->scissor[0];
363 x2 = thread->scissor[0] + thread->scissor[2];
364 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
365 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the whole framebuffer is used
366 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
368 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
370 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
371 thread->fb_scissor[0] = x1;
372 thread->fb_scissor[1] = y1;
373 thread->fb_scissor[2] = x2 - x1;
374 thread->fb_scissor[3] = y2 - y1;
376 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
379 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
381 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
384 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
386 if (thread->blendsubtract)
388 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390 #define BLENDFUNC(sfactor, dfactor, blendmode) \
391 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
393 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
398 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
400 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
401 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
402 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
403 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
404 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
405 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
406 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
407 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
408 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
409 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
410 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one group in f is
// flagged in thread->validate; the expression always evaluates to 0.
// thread is parenthesized so pointer expressions such as base + n expand
// correctly. It is evaluated more than once, so pass an expression without
// side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived-value groups that are both requested in mask and
// flagged stale in thread->validate, clearing each flag before recalculating.
// NOTE(review): two short lines after the masking statement are not visible in
// this listing (presumably an early return when nothing is left to do).
417 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested groups that are actually stale
419 mask &= thread->validate;
422 if (mask & DPSOFTRAST_VALIDATE_FB)
424 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
425 DPSOFTRAST_RecalcFB(thread);
427 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
429 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
430 DPSOFTRAST_RecalcDepthFunc(thread);
432 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
434 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
435 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by index. An index is accepted when it lies in
// [1, dpsoftrast.texture_end) and the slot has pixel storage allocated.
// NOTE(review): the fall-through return is not visible in this listing;
// callers test the result with `!texture`, so it is presumably NULL.
439 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
441 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
442 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and rebases every bound-texture
// pointer (global state and each thread) onto the reallocated storage.
// NOTE(review): several short lines (declarations of i and j, and whatever
// precedes the resize) are not visible in this listing.
// NOTE(review): the realloc result is stored straight into dpsoftrast.texture
// with no NULL check, so a failed realloc would lose the old array; the
// rebasing also does pointer arithmetic on the old pointer after realloc.
446 static void DPSOFTRAST_Texture_Grow(void)
448 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
449 DPSOFTRAST_State_Thread *thread;
453 // expand texture array as needed
454 if (dpsoftrast.texture_max < 1024)
455 dpsoftrast.texture_max = 1024;
// NOTE(review): presumably the else branch of the test above (the else line is not shown)
457 dpsoftrast.texture_max *= 2;
458 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// keep each binding pointing at the same element index in the new array
459 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
460 if (dpsoftrast.texbound[i])
461 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
462 for (j = 0; j < dpsoftrast.numthreads; j++)
464 thread = &dpsoftrast.threads[j];
465 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
466 if (thread->texbound[i])
467 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Validates the requested texture parameters, claims a slot in
// dpsoftrast.texture, fills in the mipmap table and allocates zeroed pixel
// storage. Each failed validation stores a message in dpsoftrast.errorstring.
// NOTE(review): the return statements, the format switch header, local
// declarations and the mip-level loop header are not visible in this listing;
// the function presumably returns the new texture index (0 on error) — confirm.
471 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cube maps store six faces
480 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
481 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
482 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is computed in int before the size limit below is applied
483 if (width*height*depth < 1)
485 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
488 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
490 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
495 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
496 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
497 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
499 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
500 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
502 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the condition guarding this second identical message is not visible here
507 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
510 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
512 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
517 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
519 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
522 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this test repeats the previous one verbatim
527 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
529 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
532 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
534 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero exactly when x is not a power of two (for x >= 1)
537 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
539 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
542 // find first empty slot in texture array
543 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
544 if (!dpsoftrast.texture[texnum].bytes)
546 dpsoftrast.texture_firstfree = texnum + 1;
547 if (dpsoftrast.texture_max <= texnum)
548 DPSOFTRAST_Texture_Grow();
549 if (dpsoftrast.texture_end <= texnum)
550 dpsoftrast.texture_end = texnum + 1;
551 texture = &dpsoftrast.texture[texnum];
552 memset(texture, 0, sizeof(*texture));
553 texture->flags = flags;
554 texture->width = width;
555 texture->height = height;
556 texture->depth = depth;
557 texture->sides = sides;
// 4 bytes per texel; each mipmap row records offset, byte size, width, height, depth
569 s = w * h * d * sides * 4;
570 texture->mipmap[mipmaps][0] = size;
571 texture->mipmap[mipmaps][1] = s;
572 texture->mipmap[mipmaps][2] = w;
573 texture->mipmap[mipmaps][3] = h;
574 texture->mipmap[mipmaps][4] = d;
// NOTE(review): presumably ends the level loop at 1x1x1 or when mipmaps were not requested
577 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
583 texture->mipmaps = mipmaps;
584 texture->size = size;
586 // allocate the pixels now
// NOTE(review): no NULL check on this allocation is visible in this listing
587 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// first-free / end markers of the texture table. Unknown indices are ignored.
// NOTE(review): a few lines between the lookup and the free are not visible
// in this listing.
591 void DPSOFTRAST_Texture_Free(int index)
593 DPSOFTRAST_Texture *texture;
594 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
598 MM_FREE(texture->bytes);
599 texture->bytes = NULL;
// zeroing the whole slot also leaves bytes NULL, which marks the slot unused
600 memset(texture, 0, sizeof(*texture));
601 // adjust the free range and used range
602 if (dpsoftrast.texture_firstfree > index)
603 dpsoftrast.texture_firstfree = index;
// trim trailing unused slots from the used range
604 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
605 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture, each by averaging texels
// of the level above (4 bytes per texel, channels averaged independently with
// rounding). Does nothing for unknown textures.
// NOTE(review): the statements computing layer0/layer1/row0/row1 and the
// branch selecting the 3D vs 2D paths are not visible in this listing.
607 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
609 int i, x, y, z, w, layer0, layer1, row0, row1;
610 unsigned char *o, *i0, *i1, *i2, *i3;
611 DPSOFTRAST_Texture *texture;
612 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
613 if (texture->mipmaps <= 1)
615 for (i = 1;i < texture->mipmaps;i++)
617 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
621 if (layer1 >= texture->mipmap[i-1][4])
622 layer1 = texture->mipmap[i-1][4]-1;
623 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
627 if (row1 >= texture->mipmap[i-1][3])
628 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the four parent rows (two layers x two rows) in level i-1
629 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
630 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
631 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
632 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
633 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
634 w = texture->mipmap[i][2];
637 if (texture->mipmap[i-1][2] > 1)
639 // average 3D texture
// eight parent texels per output texel
640 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
642 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
643 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
644 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
645 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
650 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are read but never advanced by this loop — confirm w is always 1 on this path
651 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
653 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
654 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
655 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
656 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
662 if (texture->mipmap[i-1][2] > 1)
664 // average 2D texture (common case)
665 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
667 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
668 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
669 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
670 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
675 // 2D texture with parent width == 1
676 o[0] = (i0[0] + i1[0] + 1) >> 1;
677 o[1] = (i0[1] + i1[1] + 1) >> 1;
678 o[2] = (i0[2] + i1[2] + 1) >> 1;
679 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from pixels
// into the texture at (blockx, blocky), then regenerates the mip chain.
// pixels is read as tightly packed rows of blockwidth texels.
// NOTE(review): mip is not used in the visible lines — the copy always
// addresses level 0; the declaration of dst and the loop's decrement of
// blockheight are not visible in this listing.
686 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
688 DPSOFTRAST_Texture *texture;
690 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
693 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
694 while (blockheight > 0)
696 memcpy(dst, pixels, blockwidth * 4);
697 pixels += blockwidth * 4;
// advance one full level-0 row in the destination
698 dst += texture->mipmap[0][2] * 4;
701 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole level-0 image of a texture with pixels (mipmap[0][1]
// bytes are copied) and regenerates the mip chain. Unknown indices are ignored.
// NOTE(review): two lines between the lookup and the copy are not visible in
// this listing.
703 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
705 DPSOFTRAST_Texture *texture;
706 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
709 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
710 DPSOFTRAST_Texture_CalculateMipmaps(index);
712 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
714 DPSOFTRAST_Texture *texture;
715 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
716 return texture->mipmap[mip][2];
718 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
720 DPSOFTRAST_Texture *texture;
721 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722 return texture->mipmap[mip][3];
724 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
726 DPSOFTRAST_Texture *texture;
727 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 for an unknown texture.
// NOTE(review): mip is not range-checked in the visible lines, and two lines
// before the return are not visible in this listing.
730 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
732 DPSOFTRAST_Texture *texture;
733 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
736 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filters above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported through dpsoftrast.errorstring
// when the texture was created without the MIPMAP flag.
// NOTE(review): the lines following the error assignment (presumably an early
// return) are not visible in this listing.
738 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
740 DPSOFTRAST_Texture *texture;
741 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
742 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
744 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
749 texture->filter = filter;
// Installs the framebuffer dimensions, the depth buffer and up to four color
// buffers as the current render targets.
// NOTE(review): the statement controlled by the change test below is not
// visible in this listing, so it is unclear from here whether the assignments
// are conditional.
752 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
// true when any target or dimension differs from the current setup
754 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
755 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
756 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
758 dpsoftrast.fb_width = width;
759 dpsoftrast.fb_height = height;
760 dpsoftrast.fb_depthpixels = depthpixels;
761 dpsoftrast.fb_colorpixels[0] = colorpixels0;
762 dpsoftrast.fb_colorpixels[1] = colorpixels1;
763 dpsoftrast.fb_colorpixels[2] = colorpixels2;
764 dpsoftrast.fb_colorpixels[3] = colorpixels3;
767 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current write position of the command pool in
// dpsoftrast.drawcommand. When threads are in use a MEMORY_BARRIER is issued
// first, before the new position is stored.
769 static void DPSOFTRAST_Draw_SyncCommands(void)
771 if(dpsoftrast.usethreads) MEMORY_BARRIER;
772 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make `space` bytes available in the command pool. Publishes the
// queued commands, measures how far each thread's commandoffset trails the
// write position, and waits on a thread's waitcond while the pool is too full.
// NOTE(review): loop headers, early returns, and the declarations and updates
// of i, commandoffset and waitindex are not visible in this listing.
775 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
777 DPSOFTRAST_State_Thread *thread;
779 int freecommand = dpsoftrast.commandpool.freecommand;
780 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already
781 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
783 DPSOFTRAST_Draw_SyncCommands();
789 for (i = 0; i < dpsoftrast.numthreads; i++)
791 thread = &dpsoftrast.threads[i];
// distance from this thread's read position to the write position, modulo the pool size
792 commandoffset = freecommand - thread->commandoffset;
793 if (commandoffset < 0)
794 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
795 if (commandoffset > usedcommands)
798 usedcommands = commandoffset;
801 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
803 thread = &dpsoftrast.threads[waitindex];
804 Thread_LockMutex(thread->drawmutex);
// only wait if the thread has not yet consumed everything that was published
805 if (thread->commandoffset != dpsoftrast.drawcommand)
807 thread->waiting = true;
808 if (thread->starving) Thread_CondSignal(thread->drawcond);
809 Thread_CondWait(thread->waitcond, thread->drawmutex);
810 thread->waiting = false;
812 Thread_UnlockMutex(thread->drawmutex);
814 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE (COMMAND_SIZE must be a power of two for the masking to work).
817 #define DPSOFTRAST_ALIGNCOMMAND(size) \
818 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> and returns it as a typed pointer.
819 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
820 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command ring buffer and writes the command
// header (opcode and size). When the tail of the buffer is too small, a Reset
// command is written there and the remaining tail bytes are counted as used.
// NOTE(review): the else, the statements that reset/advance freecommand and
// the final return are not visible in this listing.
822 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
824 DPSOFTRAST_Command *command;
825 int freecommand = dpsoftrast.commandpool.freecommand;
826 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one extra command header
827 int extra = sizeof(DPSOFTRAST_Command);
// the unusable tail of the buffer also has to be accounted for
828 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
829 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
830 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
832 if (dpsoftrast.usethreads)
833 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
835 DPSOFTRAST_Draw_FlushThreads();
836 freecommand = dpsoftrast.commandpool.freecommand;
837 usedcommands = dpsoftrast.commandpool.usedcommands;
839 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
842 command->opcode = DPSOFTRAST_OPCODE_Reset;
843 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
846 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
847 command->opcode = opcode;
// stored in an unsigned short field
848 command->commandsize = size;
850 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
852 dpsoftrast.commandpool.freecommand = freecommand;
853 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Rolls the command pool back by `size` bytes.
// NOTE(review): the statement that moves freecommand back and the condition
// guarding the wrap-around adjustment are not visible in this listing.
857 static void DPSOFTRAST_UndoCommand(int size)
859 int freecommand = dpsoftrast.commandpool.freecommand;
860 int usedcommands = dpsoftrast.commandpool.usedcommands;
863 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
864 usedcommands -= size;
865 dpsoftrast.commandpool.freecommand = freecommand;
866 dpsoftrast.commandpool.usedcommands = usedcommands;
869 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
870 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
872 thread->viewport[0] = command->x;
873 thread->viewport[1] = command->y;
874 thread->viewport[2] = command->width;
875 thread->viewport[3] = command->height;
876 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command and updates the global viewport and its derived
// transform immediately.
// NOTE(review): two lines after the allocation (presumably the x/y
// assignments) are not visible in this listing.
878 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
880 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
883 command->width = width;
884 command->height = height;
886 dpsoftrast.viewport[0] = x;
887 dpsoftrast.viewport[1] = y;
888 dpsoftrast.viewport[2] = width;
889 dpsoftrast.viewport[3] = height;
890 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
893 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Applies a queued ClearColor command: fills the scissor rectangle of every
// attached color buffer with the packed BGRA8 clear color, limited to the row
// ranges this thread owns (miny1..maxy1 and miny2..maxy2).
// NOTE(review): declarations of c, p and bandy, some clamps and the innermost
// store are not visible in this listing.
894 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
896 int i, x1, y1, x2, y2, w, h, x, y;
897 int miny1 = thread->miny1;
898 int maxy1 = thread->maxy1;
899 int miny2 = thread->miny2;
900 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read
904 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
905 x1 = thread->fb_scissor[0];
906 y1 = thread->fb_scissor[1];
907 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
908 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
909 if (y1 < miny1) y1 = miny1;
910 if (y2 > maxy2) y2 = maxy2;
915 // FIXME: honor fb_colormask?
916 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
917 for (i = 0;i < 4;i++)
// unattached color buffers are skipped
919 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1, then y jumps to miny2 for the second band
921 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
924 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
925 for (x = x1;x < x2;x++)
// Queues a ClearColor command.
// NOTE(review): the lines storing r, g, b, a into the command are not visible in this listing.
930 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
932 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
939 DEFCOMMAND(3, ClearDepth, float depth;)
// Applies a queued ClearDepth command: fills the scissor rectangle of the
// depth buffer with the converted depth value, limited to the row ranges this
// thread owns (miny1..maxy1 and miny2..maxy2).
// NOTE(review): declarations of c, p and bandy, some clamps and the innermost
// store are not visible in this listing.
940 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
942 int x1, y1, x2, y2, w, h, x, y;
943 int miny1 = thread->miny1;
944 int maxy1 = thread->maxy1;
945 int miny2 = thread->miny2;
946 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read
950 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
951 x1 = thread->fb_scissor[0];
952 y1 = thread->fb_scissor[1];
953 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
955 if (y1 < miny1) y1 = miny1;
956 if (y2 > maxy2) y2 = maxy2;
961 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1, then y jumps to miny2 for the second band
962 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
965 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
966 for (x = x1;x < x2;x++)
// Queues a ClearDepth command.
// NOTE(review): the line storing d into the command is not visible in this listing.
970 void DPSOFTRAST_ClearDepth(float d)
972 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
976 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
977 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
979 thread->colormask[0] = command->r != 0;
980 thread->colormask[1] = command->g != 0;
981 thread->colormask[2] = command->b != 0;
982 thread->colormask[3] = command->a != 0;
983 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command.
// NOTE(review): the lines storing r, g, b, a into the command are not visible in this listing.
985 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
987 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
994 DEFCOMMAND(5, DepthTest, int enable;)
995 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
997 thread->depthtest = command->enable;
998 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Queues a DepthTest command carrying `enable`.
1000 void DPSOFTRAST_DepthTest(int enable)
1002 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1003 command->enable = enable;
1006 DEFCOMMAND(6, ScissorTest, int enable;)
1007 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1009 thread->scissortest = command->enable;
1010 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command carrying `enable`.
1012 void DPSOFTRAST_ScissorTest(int enable)
1014 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1015 command->enable = enable;
1018 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1019 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1021 thread->scissor[0] = command->x;
1022 thread->scissor[1] = command->y;
1023 thread->scissor[2] = command->width;
1024 thread->scissor[3] = command->height;
1025 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command.
// NOTE(review): two lines after the allocation (presumably the x/y
// assignments) are not visible in this listing.
1027 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1029 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1032 command->width = width;
1033 command->height = height;
1036 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1037 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1039 thread->blendfunc[0] = command->sfactor;
1040 thread->blendfunc[1] = command->dfactor;
1041 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendFunc command carrying the source and destination factors.
1043 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1045 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1046 command->sfactor = sfactor;
1047 command->dfactor = dfactor;
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1053 thread->blendsubtract = command->enable;
1054 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendSubtract command carrying `enable`.
1056 void DPSOFTRAST_BlendSubtract(int enable)
1058 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059 command->enable = enable;
1062 DEFCOMMAND(10, DepthMask, int enable;)
1063 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1065 thread->depthmask = command->enable;
// Queues a DepthMask command carrying `enable`.
1067 void DPSOFTRAST_DepthMask(int enable)
1069 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1070 command->enable = enable;
1073 DEFCOMMAND(11, DepthFunc, int func;)
1074 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1076 thread->depthfunc = command->func;
// Queues a DepthFunc command carrying the comparison function `func`.
1078 void DPSOFTRAST_DepthFunc(int func)
1080 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1081 command->func = func;
// Command 12 (DepthRange): payload is the near and far depth values.
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: thread->depthrange[0] receives the near
// value, thread->depthrange[1] the far value.
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1087 thread->depthrange[0] = command->nearval;
1088 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command with the near/far pair.
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1092 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093 command->nearval = nearval;
1094 command->farval = farval;
// Command 13 (PolygonOffset): payload is the two offset terms.
1097 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: thread->polygonoffset[0] receives
// alongnormal, thread->polygonoffset[1] receives intoview.
1098 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1100 thread->polygonoffset[0] = command->alongnormal;
1101 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command with both terms.
1103 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1105 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1106 command->alongnormal = alongnormal;
1107 command->intoview = intoview;
// Command 14 (CullFace): payload is the culling mode.
1110 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command: stores the mode in thread->cullface.
1111 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1113 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command carrying the mode.
1115 void DPSOFTRAST_CullFace(int mode)
1117 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1118 command->mode = mode;
// Command 15 (AlphaTest): payload is a single enable flag.
1121 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command: stores the flag in thread->alphatest, which
// DPSOFTRAST_Draw_Span_Finish checks before masking pixels.
1122 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1124 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command carrying the flag.
1126 void DPSOFTRAST_AlphaTest(int enable)
1128 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1129 command->enable = enable;
// Command 16 (AlphaFunc): payload is the comparison function id and the
// reference value.
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: stores func in thread->alphafunc and ref in
// thread->alphavalue.
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1135 thread->alphafunc = command->func;
1136 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and records func.
// NOTE(review): the store of ref into the command is not visible in this
// listing; presumably it follows on a line not shown - confirm.
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1140 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141 command->func = func;
// Sets the constant RGBA color in dpsoftrast.color. No command is allocated;
// the value is written straight into the global state. DPSOFTRAST_Array_Load
// replicates this color when neither color pointer is set.
1145 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1147 dpsoftrast.color[0] = r;
1148 dpsoftrast.color[1] = g;
1149 dpsoftrast.color[2] = b;
1150 dpsoftrast.color[3] = a;
// Reads back a rectangle of the color buffer (dpsoftrast.fb_colorpixels[0])
// into outpixels, 4 bytes per pixel.
//   blockx, blocky, blockwidth, blockheight: requested rectangle
//   outpixels: destination; rows are blockwidth*4 bytes apart
// The rectangle is clamped to the framebuffer bounds before copying.
1153 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte strides of one output row and one framebuffer row
1155 int outstride = blockwidth * 4;
1156 int instride = dpsoftrast.fb_width * 4;
1159 int bx2 = blockx + blockwidth;
1160 int by2 = blocky + blockheight;
1164 unsigned char *inpixels;
// clamp the rectangle to [0, fb_width) x [0, fb_height)
1168 if (bx1 < 0) bx1 = 0;
1169 if (by1 < 0) by1 = 0;
1170 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1171 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1173 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk each row pixel by pixel (per-pixel body not shown
// in this listing)
1174 if (dpsoftrast.bigendian)
1176 for (y = by1;y < by2;y++)
// the framebuffer row is addressed as (fb_height - 1 - y): rows are read in
// vertically flipped order relative to the output
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
1180 for (x = bx1;x < bx2;x++)
// second row loop with the same addressing; presumably the non-big-endian
// path (the branch keyword and copy statement are not shown) - confirm
1193 for (y = by1;y < by2;y++)
1195 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the color buffer into one mip level of a texture.
//   index: texture handle, mip: mip level
//   tx, ty: destination corner in the texture
//   sx, sy: source corner in the framebuffer
//   width, height: rectangle size
// Returns without copying if the texture lookup fails or mip is out of range.
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 int tx2 = tx + width;
1207 int ty2 = ty + height;
1210 int sx2 = sx + width;
1211 int sy2 = sy + height;
1221 unsigned int *spixels;
1222 unsigned int *tpixels;
1223 DPSOFTRAST_Texture *texture;
1224 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the color buffer, destination is the selected mip level
1227 spixels = dpsoftrast.fb_colorpixels[0];
1228 swidth = dpsoftrast.fb_width;
1229 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as the byte offset into texture->bytes, [2] and [3]
// as the level's width and height
1230 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231 twidth = texture->mipmap[mip][2];
1232 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1233 if (tx1 < 0) tx1 = 0;
1234 if (ty1 < 0) ty1 = 0;
1235 if (tx2 > twidth) tx2 = twidth;
1236 if (ty2 > theight) ty2 = theight;
1237 if (sx1 < 0) sx1 = 0;
1238 if (sy1 < 0) sy1 = 0;
1239 if (sx2 > swidth) sx2 = swidth;
1240 if (sy2 > sheight) sy2 = sheight;
// copy size is limited to the smaller of the two clamped rectangles
// (tw/th/sw/sh are computed on lines not shown in this listing)
1245 if (tw > sw) tw = sw;
1246 if (th > sh) th = sh;
1247 if (tw < 1 || th < 1)
// source rows are walked downward from (sheight - 1 - sy1) while texture rows
// go upward, so the copy flips the image vertically
1249 sy1 = sheight - 1 - sy1;
1250 for (y = 0;y < th;y++)
1251 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// rebuild the mip chain when the texture has more than one level
1252 if (texture->mipmaps > 1)
1253 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture): payload is the texture unit and the texture
// pointer (may be NULL).
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state: drops this thread's
// bind count on the texture previously bound to the unit (if any), then
// records the new texture pointer.
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1259 if (thread->texbound[command->unitnum])
1260 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261 thread->texbound[command->unitnum] = command->texture;
// Binds texture handle `index` to texture unit `unitnum`.
//   unitnum: must be in [0, DPSOFTRAST_MAXTEXTUREUNITS); otherwise
//            dpsoftrast.errorstring is set
//   index:   texture handle; 0 is accepted with no texture (the handle check
//            below only rejects a non-zero index that fails to resolve)
// Allocates a SetTexture command, mirrors the binding in dpsoftrast.texbound,
// and raises the texture's bind count by dpsoftrast.numthreads; each thread
// decrements it again in DPSOFTRAST_Interpret_SetTexture when the binding is
// replaced.
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1265 DPSOFTRAST_Command_SetTexture *command;
1266 DPSOFTRAST_Texture *texture;
1267 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1269 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1272 texture = DPSOFTRAST_Texture_GetByIndex(index);
1273 if (index && !texture)
1275 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1279 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280 command->unitnum = unitnum;
1281 command->texture = texture;
1283 dpsoftrast.texbound[unitnum] = texture;
// texture is NULL when index 0 unbinds the unit; only a real texture has a
// bind count to adjust (previously this dereferenced NULL)
1284 if (texture) ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array and its byte stride in the global state.
// DPSOFTRAST_Array_Load later reads it as 3-float vertices.
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1289 dpsoftrast.pointer_vertex3f = vertex3f;
1290 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array. The byte-color pointer is cleared so
// that only one of the two color sources is set at a time.
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1294 dpsoftrast.pointer_color4f = color4f;
1295 dpsoftrast.pointer_color4ub = NULL;
1296 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array. The float color pointer is
// cleared so that only one of the two color sources is set at a time.
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1300 dpsoftrast.pointer_color4f = NULL;
1301 dpsoftrast.pointer_color4ub = color4ub;
1302 dpsoftrast.stride_color = stride;
// Records the texcoord array for one unit: pointer, component count per
// vertex, and byte stride. DPSOFTRAST_Array_Load dispatches on the component
// count to choose the loader.
// NOTE(review): unitnum indexes the arrays directly; no range check is
// visible here - confirm that callers validate it.
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1306 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader): payload is the shader mode and permutation.
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Applies a SetShader command: stores mode and permutation in the thread
// state.
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1314 thread->shader_mode = command->mode;
1315 thread->shader_permutation = command->permutation;
// Public entry point: allocates a SetShader command and also mirrors the
// selection in the global dpsoftrast state.
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1319 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320 command->mode = mode;
1321 command->permutation = permutation;
1323 dpsoftrast.shader_mode = mode;
1324 dpsoftrast.shader_permutation = permutation;
// Command 19 (Uniform4f): payload is the uniform slot and four floats.
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into the thread's
// uniform4f array at element offset index*4.
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1330 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a 4-component float uniform from scalar arguments: allocates a
// Uniform4f command, then mirrors the same values into dpsoftrast.uniform4f
// at element offset index*4.
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1334 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335 command->index = index;
1336 command->val[0] = v0;
1337 command->val[1] = v1;
1338 command->val[2] = v2;
1339 command->val[3] = v3;
1341 dpsoftrast.uniform4f[index*4+0] = v0;
1342 dpsoftrast.uniform4f[index*4+1] = v1;
1343 dpsoftrast.uniform4f[index*4+2] = v2;
1344 dpsoftrast.uniform4f[index*4+3] = v3;
// Pointer variant of DPSOFTRAST_Uniform4f: v must point to at least four
// floats, which are copied into both the command and the global mirror.
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1348 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349 command->index = index;
1350 memcpy(command->val, v, sizeof(command->val));
1352 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f): payload is the first uniform slot and 16
// floats declared with ALIGN() so aligned SSE stores can target them.
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into the thread's
// uniform4f array starting at element offset index*4, i.e. four consecutive
// 4-float slots.
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1358 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
//   v: arraysize*16 floats; may be unaligned
//   transpose: presumably selects the transpose step below (the guarding
//              condition is not visible in this listing) - confirm
// One UniformMatrix4f command is allocated per matrix and the result is also
// mirrored into dpsoftrast.uniform4f.
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix occupies four 4-float uniform slots and 16 input floats
1364 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1366 __m128 m0, m1, m2, m3;
1367 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the input pointer
1369 if (((size_t)v)&(ALIGN_SIZE-1))
1371 m0 = _mm_loadu_ps(v);
1372 m1 = _mm_loadu_ps(v+4);
1373 m2 = _mm_loadu_ps(v+8);
1374 m3 = _mm_loadu_ps(v+12);
1378 m0 = _mm_load_ps(v);
1379 m1 = _mm_load_ps(v+4);
1380 m2 = _mm_load_ps(v+8);
1381 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave pairs of rows, then recombine low/high halves so
// that m0..m3 hold what were the columns
1385 __m128 t0, t1, t2, t3;
1386 t0 = _mm_unpacklo_ps(m0, m1);
1387 t1 = _mm_unpacklo_ps(m2, m3);
1388 t2 = _mm_unpackhi_ps(m0, m1);
1389 t3 = _mm_unpackhi_ps(m2, m3);
1390 m0 = _mm_movelh_ps(t0, t1);
1391 m1 = _mm_movehl_ps(t1, t0);
1392 m2 = _mm_movelh_ps(t2, t3);
1393 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload and into the global mirror
1395 _mm_store_ps(command->val, m0);
1396 _mm_store_ps(command->val+4, m1);
1397 _mm_store_ps(command->val+8, m2);
1398 _mm_store_ps(command->val+12, m3);
1399 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i): payload is the uniform slot and one integer.
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores the integer in thread->uniform1i[index].
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1410 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: allocates a Uniform1i command and mirrors the
// value into dpsoftrast.uniform1i.
// NOTE(review): the store of i0 into command->val is not visible in this
// listing; presumably it sits on a line not shown - confirm.
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1414 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415 command->index = index;
1418 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float vectors from a strided byte source into dst.
//   dst: written with aligned stores
//   src, stride: source base pointer and byte stride between elements
// (The loop statements are not visible in this listing; `end` marks one past
// the last output float.)
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1424 float *end = dst + size*4;
// unaligned loads are needed if either the base pointer or the stride would
// break ALIGN_SIZE alignment
1425 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1429 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1438 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float vectors from a strided byte source into 4-float
// vectors in dst, setting the fourth component to 1.0f.
//   dst: written with aligned stores
//   src, stride: source base pointer and byte stride between elements
// The fourth lane is set by rotating the vector so the lane to overwrite sits
// in lane 0, inserting 1.0f with _mm_move_ss, and rotating back.
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1447 float *end = dst + size*4;
// tightly packed input: four vertices (12 floats) are processed per iteration
// from three 16-byte loads; end4 bounds the multiple-of-four part
1448 if (stride == sizeof(float[3]))
1450 float *end4 = dst + (size&~3)*4;
1451 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1455 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// vertex 0: x0 y0 z0 1
1456 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 1: x1 from v1, y1 z1 from v2
1459 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 2: x2 y2 from v2, z2 from v3
1462 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 3: x3 y3 z3 already occupy lanes 1..3 of v3
1466 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1469 src += 4*sizeof(float[3]);
// same four-vertex unpacking using aligned loads
1476 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1490 src += 4*sizeof(float[3]);
// one vertex per load; unaligned variant when src or stride is not a
// multiple of ALIGN_SIZE
// NOTE(review): each load reads four floats, i.e. one float beyond the
// 3-float element; confirm the source buffer is readable past its last vertex
1494 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1498 __m128 v = _mm_loadu_ps((const float *)src);
1499 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502 _mm_store_ps(dst, v);
1511 __m128 v = _mm_load_ps((const float *)src);
1512 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515 _mm_store_ps(dst, v);
// Expands `size` 2-float vectors from a strided byte source into 4-float
// vectors (x, y, 0, 1) in dst.
//   dst: written with aligned stores
//   src, stride: source base pointer and byte stride between elements
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1524 float *end = dst + size*4;
// v2 supplies the constant upper lanes: z = 0, w = 1
1525 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// tightly packed input: one 16-byte load yields two vertices; end2 bounds the
// multiple-of-two part
1526 if (stride == sizeof(float[2]))
1528 float *end2 = dst + (size&~1)*4;
1529 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1533 __m128 v = _mm_loadu_ps((const float *)src);
// low half of v + upper lanes of v2 -> first vertex
1534 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// high half of v moved into the low lanes of v2 -> second vertex
1535 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1537 src += 2*sizeof(float[2]);
1544 __m128 v = _mm_load_ps((const float *)src);
1545 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1548 src += 2*sizeof(float[2]);
// one vertex at a time: a 64-bit load replaces the low two lanes of v2
1554 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte vectors from a strided byte source into 4-float
// vectors in dst, scaling each byte by 1/255.
//   dst: written with aligned stores
//   src, stride: source base pointer and byte stride between elements
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1562 float *end = dst + size*4;
1563 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// tightly packed input: 16 bytes (four vertices) per iteration; end4 bounds
// the multiple-of-four part
1564 if (stride == sizeof(unsigned char[4]))
1566 float *end4 = dst + (size&~3)*4;
1567 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// zero-extend bytes to 16 bits (v1 = low 8 bytes, v2 = high 8 bytes), then to
// 32 bits, convert to float and scale
1571 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1577 src += 4*sizeof(unsigned char[4]);
// same conversion using an aligned 16-byte load
1584 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1590 src += 4*sizeof(unsigned char[4]);
// one vertex at a time: read the four bytes as a single 32-bit value
1596 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the 4-float vector at src into dst with aligned stores.
//   src: may be unaligned (loaded once with _mm_loadu_ps)
// `end` is dst + 4*size, so the store presumably repeats for `size` vectors;
// the loop statement is not visible in this listing - confirm.
1603 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1605 float *end = dst + 4*size;
1606 __m128 v = _mm_loadu_ps(src);
1609 _mm_store_ps(dst, v);
// Transforms `numitems` 4-float vectors by a 4x4 matrix.
//   out4f: destination; may be the same pointer as in4f
//   in4f: source vectors; aligned or unaligned
//   inmatrix16f: 16 floats, read as four consecutive 4-float groups m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
1615 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1618 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1619 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1621 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1623 // fast case for identity matrix
1624 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1627 end = out4f + numitems*4;
// the matrix is always loaded with unaligned loads
1628 m0 = _mm_loadu_ps(inmatrix16f);
1629 m1 = _mm_loadu_ps(inmatrix16f + 4);
1630 m2 = _mm_loadu_ps(inmatrix16f + 8);
1631 m3 = _mm_loadu_ps(inmatrix16f + 12);
// two copies of the same computation, differing only in the input load
1632 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1636 __m128 v = _mm_loadu_ps(in4f);
// broadcast each component of v and accumulate component * matrix group
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1650 __m128 v = _mm_load_ps(in4f);
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1654 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1655 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vectors from in4f to out4f with memcpy, so the
// two ranges must not overlap.
1663 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1665 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Vertex helper macros operating on __m128 values.
//
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale):
//   perspective-divides `in` by its fourth lane and applies the viewport.
//   The vector is rotated so lane 0 can be replaced by 1.0f, giving
//   (1, x, y, z); it is then scaled, divided by w, offset by the center and
//   rotated back. Consequently viewportcenter/viewportscale are consumed in
//   that rotated lane order and the fourth output lane ends up as
//   center[0] + scale[0]/w.
//
// DPSOFTRAST_PROJECTY: the lines visible here are identical to
//   DPSOFTRAST_PROJECTVERTEX.
//
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3):
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, where p is a local declared on
//   a line not shown in this listing (presumably a copy of `in`).
1669 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1671 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1677 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1679 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1680 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1681 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1682 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1685 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1688 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1691 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space row range covered by the bounding box
// [minpos, maxpos] after transformation by m0..m3.
//   starty, endy: outputs; starty is derived from the maximum projected
//                 value, endy from the minimum plus one
//   viewportcenter, viewportscale: viewport mapping in the lane order used by
//                 DPSOFTRAST_PROJECTVERTEX
// A bit in clipmask stays set for every box corner whose clip distance
// (z + w) is negative.
// NOTE(review): the return statement is not visible in this listing;
// presumably clipmask is returned - confirm.
1694 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1696 int clipmask = 0xFF;
// minproj/maxproj start inverted (2, -2) so the first min/max replaces them
1697 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix group so the transformed y lands in
// lane 0, where the scalar (_ss) operations below work
1698 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1699 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1700 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1701 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its clip distance z + w and,
// when that is >= 0, clear its clipmask bit and fold y/w into
// minproj/maxproj. The invocations that follow the macro feed it the box
// corners built from combinations of minpos and maxpos lanes.
1702 #define BBFRONT(k, pos) \
1704 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1705 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1706 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1709 clipmask &= ~(1<<k); \
1710 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711 minproj = _mm_min_ss(minproj, proj); \
1712 maxproj = _mm_max_ss(maxproj, proj); \
1716 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1717 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1718 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1719 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1720 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1721 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k) body (its #define line is not shown): for a corner that is still
// flagged in clipmask, look at the three corners whose index differs in one
// bit (k^1, k^2, k^4). For each such neighbour that is not flagged,
// interpolate along that edge to where the clip distance reaches zero and
// fold the projected value into minproj/maxproj.
1725 if (clipmask&(1<<k)) \
1727 if (!(clipmask&(1<<(k^1)))) \
1729 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1730 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1731 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732 minproj = _mm_min_ss(minproj, proj); \
1733 maxproj = _mm_max_ss(maxproj, proj); \
1735 if (!(clipmask&(1<<(k^2)))) \
1737 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1738 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1739 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740 minproj = _mm_min_ss(minproj, proj); \
1741 maxproj = _mm_max_ss(maxproj, proj); \
1743 if (!(clipmask&(1<<(k^4)))) \
1745 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1746 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1747 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748 minproj = _mm_min_ss(minproj, proj); \
1749 maxproj = _mm_max_ss(maxproj, proj); \
1753 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors into lane 0 for the scalar math below
1754 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1755 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// limit the projected range to [-2, 2] before mapping it to the viewport
1756 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1757 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1758 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1759 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// truncating float->int conversions; the start row comes from maxproj and the
// end row from minproj
1760 *starty = _mm_cvttss_si32(maxproj);
1761 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` 4-float vertices to screen space without a transform.
//   out4f: receives a copy of each input vertex
//   screen4f: receives each vertex after DPSOFTRAST_PROJECTVERTEX
//   starty, endy: row range outputs, filled by DPSOFTRAST_Vertex_BoundY
//   in4f: source vertices; aligned or unaligned
// Returns whatever DPSOFTRAST_Vertex_BoundY returns.
1765 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1767 float *end = out4f + numitems*4;
1768 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1769 __m128 minpos, maxpos;
// two copies of the same per-vertex work, differing only in the input load
1770 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
1772 minpos = maxpos = _mm_loadu_ps(in4f);
1775 __m128 v = _mm_loadu_ps(in4f);
1776 minpos = _mm_min_ps(minpos, v);
1777 maxpos = _mm_max_ps(maxpos, v);
1778 _mm_store_ps(out4f, v);
1779 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1780 _mm_store_ps(screen4f, v);
1788 minpos = maxpos = _mm_load_ps(in4f);
1791 __m128 v = _mm_load_ps(in4f);
1792 minpos = _mm_min_ps(minpos, v);
1793 maxpos = _mm_max_ps(maxpos, v);
1794 _mm_store_ps(out4f, v);
1795 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1796 _mm_store_ps(screen4f, v);
// no transform was applied, so the bounds are evaluated with an identity
// matrix
1803 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1804 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1805 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1806 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1807 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms `numitems` 4-float vertices by a 4x4 matrix and projects them
// to screen space.
//   out4f: receives the transformed vertices
//   screen4f: receives the projected vertices
//   starty, endy: row range outputs, filled by DPSOFTRAST_Vertex_BoundY
//   in4f: source vertices; aligned or unaligned
//   inmatrix16f: 16 floats read as four 4-float groups
// Returns whatever DPSOFTRAST_Vertex_BoundY (or DPSOFTRAST_Vertex_Project)
// returns.
1811 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1813 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1814 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// an identity matrix (bytewise comparison) takes the projection-only path
1816 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1817 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1818 end = out4f + numitems*4;
1819 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1820 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1821 m0 = _mm_loadu_ps(inmatrix16f);
1822 m1 = _mm_loadu_ps(inmatrix16f + 4);
1823 m2 = _mm_loadu_ps(inmatrix16f + 8);
1824 m3 = _mm_loadu_ps(inmatrix16f + 12);
// two copies of the same per-vertex work, differing only in the input load
1825 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1827 minpos = maxpos = _mm_loadu_ps(in4f);
1830 __m128 v = _mm_loadu_ps(in4f);
// the bounding box is accumulated from the untransformed input; the matrix
// is applied to it later inside DPSOFTRAST_Vertex_BoundY
1831 minpos = _mm_min_ps(minpos, v);
1832 maxpos = _mm_max_ps(maxpos, v);
1833 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1834 _mm_store_ps(out4f, v);
1835 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1836 _mm_store_ps(screen4f, v);
1844 minpos = maxpos = _mm_load_ps(in4f);
1847 __m128 v = _mm_load_ps(in4f);
1848 minpos = _mm_min_ps(minpos, v);
1849 maxpos = _mm_max_ps(maxpos, v);
1850 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1851 _mm_store_ps(out4f, v);
1852 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1853 _mm_store_ps(screen4f, v);
1860 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Loads one vertex attribute for the current draw range into
// dpsoftrast.post_array4f[outarray] as 4-float vectors.
//   outarray: index of the destination array
//   inarray: attribute selector (position, color, or a texcoord unit)
// Every source pointer is offset by firstvertex * stride before loading.
// NOTE(review): the return statement is not visible in this listing;
// presumably outf is returned - confirm.
1865 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1868 float *outf = dpsoftrast.post_array4f[outarray];
1869 const unsigned char *inb;
1870 int firstvertex = dpsoftrast.firstvertex;
1871 int numvertices = dpsoftrast.numvertices;
// positions are 3-float vectors expanded with w = 1
1875 case DPSOFTRAST_ARRAY_POSITION:
1876 stride = dpsoftrast.stride_vertex;
1877 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1878 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array first, then byte array, else the constant color
1880 case DPSOFTRAST_ARRAY_COLOR:
1881 stride = dpsoftrast.stride_color;
1882 if (dpsoftrast.pointer_color4f)
1884 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1885 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1887 else if (dpsoftrast.pointer_color4ub)
1889 stride = dpsoftrast.stride_color;
1890 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1891 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1895 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and the loader
// is chosen by the stored component count
1899 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1900 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1902 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1903 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1906 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1909 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1912 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms an attribute array in place by a 4x4 matrix.
//   inarray >= 0: the attribute is first loaded into post_array4f[outarray]
//   inarray < 0:  the existing contents of post_array4f[outarray] are used
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned - confirm.
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1926 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects an attribute array to screen space (no matrix). Writes
// dpsoftrast.screencoord4f, stores the result of DPSOFTRAST_Vertex_Project in
// dpsoftrast.drawclipped, and fills drawstarty/drawendy.
//   inarray >= 0: the attribute is first loaded into post_array4f[outarray]
//   inarray < 0:  the existing contents of post_array4f[outarray] are used
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned - confirm.
1932 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1935 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1936 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms an attribute array in place by a 4x4 matrix and projects it to
// screen space. Writes dpsoftrast.screencoord4f, stores the result of
// DPSOFTRAST_Vertex_TransformProject in dpsoftrast.drawclipped, and fills
// drawstarty/drawendy.
//   inarray >= 0: the attribute is first loaded into post_array4f[outarray]
//   inarray < 0:  the existing contents of post_array4f[outarray] are used
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned - confirm.
1944 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1947 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1948 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes the per-pixel reciprocal of the interpolated w value across a span.
//   triangle->w: plane coefficients; [0] is the x slope, [1] the y slope,
//                [2] the constant term
//   span: supplies x, y, startx and endx
//   zf: per-pixel output buffer (the store itself is on a line not shown in
//       this listing - confirm)
// The reciprocal is evaluated exactly only at sub-span boundaries and
// interpolated linearly in between.
1955 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1958 int startx = span->startx;
1959 int endx = span->endx;
1960 float wslope = triangle->w[0];
// w at the span origin; individual pixels add wslope * x
1961 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1962 float endz = 1.0f / (w + wslope * startx);
1963 for (x = startx;x < endx;)
// each sub-span covers up to DPSOFTRAST_DRAW_MAXSUBSPAN pixels
1965 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the final sub-span is cut at the last pixel of the span
1967 if (nextsub >= endx) nextsub = endsub = endx-1;
1968 endz = 1.0f / (w + wslope * nextsub);
// linear step between the two exact reciprocals; 0 for a one-pixel sub-span
1969 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1970 for (; x <= endsub; x++, z += dz)
// Writes a span of float RGBA colors into the 8-bit color buffer using the
// thread's current blend mode.
//   thread: supplies alphatest and fb_blendmode
//   span: supplies x, y, startx, endx and the per-pixel mask
//   in4f: span colors, four floats per pixel in R,G,B,A order
// The framebuffer stores bytes in B,G,R,A order, so every blend mode pairs
// in4f[x*4+2] with pixel[x*4+0] and in4f[x*4+0] with pixel[x*4+2]. Results
// are clamped to 255 on the high side; only SUBALPHA also clamps at 0.
1975 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1978 int startx = span->startx;
1979 int endx = span->endx;
1982 unsigned char * RESTRICT pixelmask = span->pixelmask;
1983 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// advance to the first pixel of this span's row
1986 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1987 // handle alphatest now (this affects depth writes too)
// NOTE(review): the threshold is a fixed 0.5f; thread->alphafunc and
// thread->alphavalue are not consulted here
1988 if (thread->alphatest)
1989 for (x = startx;x < endx;x++)
1990 if (in4f[x*4+3] < 0.5f)
1991 pixelmask[x] = false;
1992 // FIXME: this does not handle bigendian
1993 switch(thread->fb_blendmode)
// OPAQUE: dst = src
1995 case DPSOFTRAST_BLENDMODE_OPAQUE:
1996 for (x = startx;x < endx;x++)
2000 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2001 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2002 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2003 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2004 pixel[x*4+0] = d[0];
2005 pixel[x*4+1] = d[1];
2006 pixel[x*4+2] = d[2];
2007 pixel[x*4+3] = d[3];
// ALPHA: dst = src*alpha + dst*(1-alpha)
2010 case DPSOFTRAST_BLENDMODE_ALPHA:
2011 for (x = startx;x < endx;x++)
2015 a = in4f[x*4+3] * 255.0f;
2016 b = 1.0f - in4f[x*4+3];
2017 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2018 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2019 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2020 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2021 pixel[x*4+0] = d[0];
2022 pixel[x*4+1] = d[1];
2023 pixel[x*4+2] = d[2];
2024 pixel[x*4+3] = d[3];
// ADDALPHA: dst = src*alpha + dst
2027 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2028 for (x = startx;x < endx;x++)
2032 a = in4f[x*4+3] * 255.0f;
2033 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2034 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2035 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2036 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2037 pixel[x*4+0] = d[0];
2038 pixel[x*4+1] = d[1];
2039 pixel[x*4+2] = d[2];
2040 pixel[x*4+3] = d[3];
// ADD: dst = src + dst
2043 case DPSOFTRAST_BLENDMODE_ADD:
2044 for (x = startx;x < endx;x++)
2048 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2049 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2050 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2051 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2052 pixel[x*4+0] = d[0];
2053 pixel[x*4+1] = d[1];
2054 pixel[x*4+2] = d[2];
2055 pixel[x*4+3] = d[3];
// INVMOD: dst = (1-src) * dst
2058 case DPSOFTRAST_BLENDMODE_INVMOD:
2059 for (x = startx;x < endx;x++)
2063 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2064 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2065 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2066 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2067 pixel[x*4+0] = d[0];
2068 pixel[x*4+1] = d[1];
2069 pixel[x*4+2] = d[2];
2070 pixel[x*4+3] = d[3];
// MUL: dst = src * dst
2073 case DPSOFTRAST_BLENDMODE_MUL:
2074 for (x = startx;x < endx;x++)
2078 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2079 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2080 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2081 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2082 pixel[x*4+0] = d[0];
2083 pixel[x*4+1] = d[1];
2084 pixel[x*4+2] = d[2];
2085 pixel[x*4+3] = d[3];
// MUL2: dst = src * dst * 2
2088 case DPSOFTRAST_BLENDMODE_MUL2:
2089 for (x = startx;x < endx;x++)
2093 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2094 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2095 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2096 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2097 pixel[x*4+0] = d[0];
2098 pixel[x*4+1] = d[1];
2099 pixel[x*4+2] = d[2];
2100 pixel[x*4+3] = d[3];
// SUBALPHA: dst = dst - src*alpha (the only mode that can go negative, hence
// the extra clamp at 0)
2103 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2104 for (x = startx;x < endx;x++)
2108 a = in4f[x*4+3] * -255.0f;
2109 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2110 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2111 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2112 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2113 pixel[x*4+0] = d[0];
2114 pixel[x*4+1] = d[1];
2115 pixel[x*4+2] = d[2];
2116 pixel[x*4+3] = d[3];
// PSEUDOALPHA: dst = src*a + dst*(1-alpha); a is assigned on a line not shown
// in this listing
2119 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2120 for (x = startx;x < endx;x++)
2125 b = 1.0f - in4f[x*4+3];
2126 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2127 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2128 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2129 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2130 pixel[x*4+0] = d[0];
2131 pixel[x*4+1] = d[1];
2132 pixel[x*4+2] = d[2];
2133 pixel[x*4+3] = d[3];
// INVADD: dst = (255-dst)*src + dst. d[0]/d[2] use the same B,G,R,A <-> R,G,B,A
// pairing as every other mode; previously the two were crossed, which swapped
// red and blue on every write.
2136 case DPSOFTRAST_BLENDMODE_INVADD:
2137 for (x = startx;x < endx;x++)
2141 d[0] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2142 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2143 d[2] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2144 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2145 pixel[x*4+0] = d[0];
2146 pixel[x*4+1] = d[1];
2147 pixel[x*4+2] = d[2];
2148 pixel[x*4+3] = d[3];
// Writes one span of 8-bit BGRA colors (in4ub, 4 bytes per pixel) into color buffer 0 of the
// framebuffer, combining them with the pixels already there according to thread->fb_blendmode.
// The destination row/column comes from span->y and span->x, the pixel range from
// span->startx..span->endx, and span->pixelmask is consulted to decide which pixels are written.
2154 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2158 int startx = span->startx;
2159 int endx = span->endx;
// ini/pixeli view the source colors and the framebuffer as one 32-bit word per pixel
2160 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2161 unsigned char * RESTRICT pixelmask = span->pixelmask;
2162 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2163 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both framebuffer views to the first pixel of this span's row
2166 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2167 pixeli += span->y * dpsoftrast.fb_width + span->x;
2168 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub holds unsigned chars, so "< 0.5f" is only true for an alpha byte of 0;
// if a half-coverage threshold was intended this presumably should compare against 128 - confirm.
2169 if (thread->alphatest)
2170 for (x = startx;x < endx;x++)
2171 if (in4ub[x*4+3] < 0.5f)
2172 pixelmask[x] = false;
2173 // FIXME: this does not handle bigendian
2174 switch(thread->fb_blendmode)
// opaque: walk the span four pixels at a time; when all four mask bytes are 1 the
// 16 bytes of source color are copied with one unaligned SSE2 load/store
2176 case DPSOFTRAST_BLENDMODE_OPAQUE:
2177 for (x = startx;x + 4 <= endx;)
2179 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2181 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2195 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1) expands to the shared blending loop used by every
// non-opaque mode below:
//  - the main loop handles two pixels per iteration and switches on their two mask
//    bytes read as one 16-bit value; it either blends both pixels with one 64-bit
//    load/store (blend2), or only pixel x+1, or only pixel x (blend1)
//  - the trailing loop handles a leftover pixel one at a time, testing pixelmask[x]
// In every path src/dst are widened from 8-bit to 16-bit lanes by unpacking against
// zero, the blend code updates dst, and _mm_packus_epi16 narrows dst back to bytes
// with unsigned saturation (so sums above 255 and negative results are clamped).
//
// ALPHA itself computes dst += ((src - dst) * alpha) / 256 with signed 16-bit math:
// alpha is the source's lane 3 broadcast to all lanes, and both factors are shifted
// left by 4 so that _mm_mulhi_epi16 (product >> 16) yields the product >> 8.
2196 #define FINISHBLEND(blend2, blend1) \
2197 for (x = startx;x + 1 < endx;x += 2) \
2200 switch (*(const unsigned short*)&pixelmask[x]) \
2203 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2204 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2206 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2209 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2210 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2212 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2215 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2216 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2218 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2223 for(;x < endx; x++) \
2226 if (!pixelmask[x]) \
2228 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2229 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2231 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2235 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2236 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2238 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2239 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// addalpha: dst += (src * alpha) >> 8
2242 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2244 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2245 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2247 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2248 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// add: dst = src + dst (clamped to 255 by the saturating pack)
2251 case DPSOFTRAST_BLENDMODE_ADD:
2252 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// invmod: dst -= (dst * src) >> 8
2254 case DPSOFTRAST_BLENDMODE_INVMOD:
2256 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2258 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// mul: dst = (src * dst) >> 8
2261 case DPSOFTRAST_BLENDMODE_MUL:
2262 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// mul2: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2264 case DPSOFTRAST_BLENDMODE_MUL2:
2265 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subalpha: dst -= (src * alpha) >> 8 (clamped to 0 by the saturating pack)
2267 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2269 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2270 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2272 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2273 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// pseudoalpha: dst = src + dst - ((dst * alpha) >> 8); the source color is added unscaled
2276 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2278 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2279 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2281 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// invadd: dst += ((255 - dst) * src) >> 8, using the same shift-by-4 / mulhi trick as ALPHA
2285 case DPSOFTRAST_BLENDMODE_INVADD:
2287 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2289 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texture unit texunitindex across the span and writes one
// RGBA float quadruple per pixel into out4f (indexed by x*4 for x in span->startx..span->endx).
// Texture coordinates come from the interpolated attribute array arrayindex; they are
// multiplied by zf[x] and by the mip level's dimensions before being used as texel positions.
// Texel bytes are read in the order [2],[1],[0],[3], i.e. BGRA storage is swizzled to RGBA output.
2296 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2299 int startx = span->startx;
2300 int endx = span->endx;
2305 float tc[2], endtc[2];
2307 unsigned int tci[2];
2308 unsigned int tci1[2];
2309 unsigned int tcimin[2];
2310 unsigned int tcimax[2];
2315 const unsigned char * RESTRICT pixelbase;
2316 const unsigned char * RESTRICT pixel[4];
2317 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2318 // if no texture is bound, just fill it with white
2321 for (x = startx;x < endx;x++)
2323 out4f[x*4+0] = 1.0f;
2324 out4f[x*4+1] = 1.0f;
2325 out4f[x*4+2] = 1.0f;
2326 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is used as a byte offset into texture->bytes for the selected mip level
2330 mip = triangle->mip[texunitindex];
2331 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2332 // if this mipmap of the texture is 1 pixel, just fill it with that color
// NOTE(review): mipmap[mip][1] == 4 presumably means "this level is 4 bytes long" - confirm.
// NOTE(review): the color below is read from texture->bytes[0..3], not from pixelbase, so it
// is the first texel of the texture storage rather than of this mip level whenever
// mipmap[mip][0] is non-zero - confirm whether pixelbase was intended.
2333 if (texture->mipmap[mip][1] == 4)
2335 c[0] = texture->bytes[2] * (1.0f/255.0f);
2336 c[1] = texture->bytes[1] * (1.0f/255.0f);
2337 c[2] = texture->bytes[0] * (1.0f/255.0f);
2338 c[3] = texture->bytes[3] * (1.0f/255.0f);
2339 for (x = startx;x < endx;x++)
2341 out4f[x*4+0] = c[0];
2342 out4f[x*4+1] = c[1];
2343 out4f[x*4+2] = c[2];
2344 out4f[x*4+3] = c[3];
2348 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2349 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2350 flags = texture->flags;
// mipmap[mip][2] and mipmap[mip][3] are used as the level's width and height:
// they scale the texcoords, give the row pitch in texels, and (minus one) the
// clamp maximum and the wrap mask
// NOTE(review): using size-1 as an AND mask only wraps correctly for power-of-two sizes - confirm
2351 tcscale[0] = texture->mipmap[mip][2];
2352 tcscale[1] = texture->mipmap[mip][3];
2353 tciwidth = texture->mipmap[mip][2];
2356 tcimax[0] = texture->mipmap[mip][2]-1;
2357 tcimax[1] = texture->mipmap[mip][3]-1;
2358 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2359 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel of the span, shifted back by half a texel
2360 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2361 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// The span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the
// coordinate is evaluated with zf only at the sub-span end, and the pixels in between step
// linearly in 16.16 fixed point (subtc/substep).
2362 for (x = startx;x < endx;)
2364 unsigned int subtc[2];
2365 unsigned int substep[2];
2366 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2367 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: end on the final pixel and rescale the step to the shorter length
2368 if (nextsub >= endx)
2370 nextsub = endsub = endx-1;
2371 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2375 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2376 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2377 substep[0] = (endtc[0] - tc[0]) * subscale;
2378 substep[1] = (endtc[1] - tc[1]) * subscale;
2379 subtc[0] = tc[0] * (1<<16);
2380 subtc[1] = tc[1] * (1<<16);
// bilinear, clamp-to-edge: both the texel and its +1 neighbour are clamped to tcimin..tcimax
2383 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2385 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// frac/ifrac are 12-bit weights summing to 0x1000 per axis; lerp[] holds the four corner weights
// NOTE(review): the integer texel is subtc>>16, so the fraction is the low 16 bits, yet
// "& 0xFFF" keeps only its low 12 bits rather than its top 12 bits - confirm this is intended.
2387 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2388 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2389 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2390 tci[0] = subtc[0]>>16;
2391 tci[1] = subtc[1]>>16;
2392 tci1[0] = tci[0] + 1;
2393 tci1[1] = tci[1] + 1;
2394 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2395 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2396 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2397 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// addresses of the four corner texels (4 bytes per texel, tciwidth texels per row)
2398 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2399 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2400 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2401 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// the four weights sum to 0x1000*0x1000 = 0x1000000, so a full-intensity byte (255)
// accumulates to 0xFF000000; dividing by that normalizes the result to 0..1
2402 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2403 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2404 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2405 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2406 out4f[x*4+0] = c[0];
2407 out4f[x*4+1] = c[1];
2408 out4f[x*4+2] = c[2];
2409 out4f[x*4+3] = c[3];
// bilinear, wrapping: same as above but coordinates are wrapped with tciwrapmask
2414 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2416 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2417 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2418 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2419 tci[0] = subtc[0]>>16;
2420 tci[1] = subtc[1]>>16;
2421 tci1[0] = tci[0] + 1;
2422 tci1[1] = tci[1] + 1;
2423 tci[0] &= tciwrapmask[0];
2424 tci[1] &= tciwrapmask[1];
2425 tci1[0] &= tciwrapmask[0];
2426 tci1[1] &= tciwrapmask[1];
2427 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2428 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2429 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2430 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2431 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2432 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2433 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2434 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2435 out4f[x*4+0] = c[0];
2436 out4f[x*4+1] = c[1];
2437 out4f[x*4+2] = c[2];
2438 out4f[x*4+3] = c[3];
// single-texel lookup, clamp-to-edge
2442 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2444 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2446 tci[0] = subtc[0]>>16;
2447 tci[1] = subtc[1]>>16;
2448 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2449 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2450 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2451 c[0] = pixel[0][2] * (1.0f / 255.0f);
2452 c[1] = pixel[0][1] * (1.0f / 255.0f);
2453 c[2] = pixel[0][0] * (1.0f / 255.0f);
2454 c[3] = pixel[0][3] * (1.0f / 255.0f);
2455 out4f[x*4+0] = c[0];
2456 out4f[x*4+1] = c[1];
2457 out4f[x*4+2] = c[2];
2458 out4f[x*4+3] = c[3];
// single-texel lookup, wrapping
2463 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2465 tci[0] = subtc[0]>>16;
2466 tci[1] = subtc[1]>>16;
2467 tci[0] &= tciwrapmask[0];
2468 tci[1] &= tciwrapmask[1];
2469 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2470 c[0] = pixel[0][2] * (1.0f / 255.0f);
2471 c[1] = pixel[0][1] * (1.0f / 255.0f);
2472 c[2] = pixel[0][0] * (1.0f / 255.0f);
2473 c[3] = pixel[0][3] * (1.0f / 255.0f);
2474 out4f[x*4+0] = c[0];
2475 out4f[x*4+1] = c[1];
2476 out4f[x*4+2] = c[2];
2477 out4f[x*4+3] = c[3];
// SSE2 texture sampler for 8-bit output: samples the 2D texture bound to texture unit
// texunitindex across the span and writes one packed 4-byte texel per pixel into out4ub
// (accessed as 32-bit words through outi). Texture coordinates come from the interpolated
// attribute array arrayindex, multiplied by zf[x] and by the mip level's dimensions.
// Texel bytes are copied/blended in storage order; no channel swizzle is applied here.
2483 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2487 int startx = span->startx;
2488 int endx = span->endx;
2490 __m128 data, slope, tcscale;
2491 __m128i tcsize, tcmask, tcoffset, tcmax;
2493 __m128i subtc, substep, endsubtc;
2496 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2497 const unsigned char * RESTRICT pixelbase;
2498 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2499 // if no texture is bound, just fill it with white
2502 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
// mipmap[mip][0] is used as a byte offset into texture->bytes for the selected mip level
2505 mip = triangle->mip[texunitindex];
2506 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2507 // if this mipmap of the texture is 1 pixel, just fill it with that color
2508 if (texture->mipmap[mip][1] == 4)
2510 unsigned int k = *((const unsigned int *)pixelbase);
2511 for (x = startx;x < endx;x++)
2515 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2516 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2517 flags = texture->flags;
// tcsize = (mipmap[mip][2], mipmap[mip][3], mipmap[mip][2], mipmap[mip][3]), i.e. the two
// size entries duplicated; tcmask is size-1 per lane and tcscale the sizes as floats
2518 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2519 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2520 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the first two attribute components into both halves and pre-scale by the size
2521 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2522 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texel-space coordinate at the first pixel (minus half a texel), also kept in 16.16 fixed point
2523 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2524 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset holds the 16-bit pair (4, width*4): _mm_madd_epi16 of an
// (x, y) 16-bit pair with it gives x*4 + y*width*4, the texel's byte offset from pixelbase
2525 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 narrowed to 16-bit lanes; used both as clamp maximum and as wrap mask below
2526 tcmax = _mm_packs_epi32(tcmask, tcmask);
// The span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the
// coordinate is evaluated with zf only at the sub-span end and stepped linearly in between.
2527 for (x = startx;x < endx;)
2529 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2530 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2531 if (nextsub >= endx)
2533 nextsub = endsub = endx-1;
2534 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2538 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2539 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2540 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// subtc now carries the coordinates of two consecutive pixels (low and high 64 bits),
// so the per-iteration step is doubled
2541 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2542 substep = _mm_slli_epi32(substep, 1);
// Bilinear fast path: if the integer coordinates at the sub-span start and at
// endsubtc + substep are all >= 0 and < size-1, no clamping or wrapping is done and
// horizontally adjacent texels are fetched together with 8-byte loads.
2545 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2546 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// byte distance between texture rows (the high 16 bits of a tcoffset lane)
2548 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2549 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2551 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2552 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2553 tci = _mm_madd_epi16(tci, tcoffset);
2554 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2555 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2556 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2557 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2558 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2559 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fracm: the 16-bit fractions halved so they stay non-negative for the signed mulhi lerps
2560 fracm = _mm_srli_epi16(subtc, 1);
2561 pix1 = _mm_add_epi16(pix1,
2562 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2563 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2564 pix3 = _mm_add_epi16(pix3,
2565 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2566 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2567 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2568 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2569 pix2 = _mm_add_epi16(pix2,
2570 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2571 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2572 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2576 const unsigned char * RESTRICT ptr1;
2577 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2578 tci = _mm_madd_epi16(tci, tcoffset);
2579 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2580 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2581 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2582 fracm = _mm_srli_epi16(subtc, 1);
2583 pix1 = _mm_add_epi16(pix1,
2584 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2585 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2586 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2587 pix1 = _mm_add_epi16(pix1,
2588 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2589 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2590 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Bilinear, clamp-to-edge: the four corner coordinates (offsets (0,0),(1,0),(0,1),(1,1)) are
// clamped to 0..size-1 and each corner texel is fetched with its own 4-byte load.
2594 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2596 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2598 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2599 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2600 tci = _mm_madd_epi16(tci, tcoffset);
2601 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2602 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2603 _mm_setzero_si128());
2604 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2605 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2606 _mm_setzero_si128());
// NOTE(review): the second pixel of the pair is wrapped with _mm_and_si128 here, whereas the
// first pixel above (and the single-pixel tail of this branch) is clamped with min/max.
// In a clamp-to-edge branch this looks like a copy of the wrapping code - confirm.
2607 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2608 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2609 tci = _mm_madd_epi16(tci, tcoffset);
2610 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2611 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2612 _mm_setzero_si128());
2613 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2614 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2615 _mm_setzero_si128());
2616 fracm = _mm_srli_epi16(subtc, 1);
2617 pix1 = _mm_add_epi16(pix1,
2618 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2619 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2620 pix3 = _mm_add_epi16(pix3,
2621 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2622 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2623 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2624 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2625 pix2 = _mm_add_epi16(pix2,
2626 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2627 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2628 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2632 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2633 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2634 tci = _mm_madd_epi16(tci, tcoffset);
2635 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2636 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2637 _mm_setzero_si128());
2638 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2639 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2640 _mm_setzero_si128());
2641 fracm = _mm_srli_epi16(subtc, 1);
2642 pix1 = _mm_add_epi16(pix1,
2643 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2646 pix1 = _mm_add_epi16(pix1,
2647 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2649 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Bilinear, wrapping: corner coordinates are wrapped by ANDing with size-1
// NOTE(review): an AND mask only wraps correctly for power-of-two sizes - confirm
2655 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2657 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2658 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659 tci = _mm_madd_epi16(tci, tcoffset);
2660 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662 _mm_setzero_si128());
2663 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665 _mm_setzero_si128());
2666 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2667 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2668 tci = _mm_madd_epi16(tci, tcoffset);
2669 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2670 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2671 _mm_setzero_si128());
2672 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2673 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2674 _mm_setzero_si128());
2675 fracm = _mm_srli_epi16(subtc, 1);
2676 pix1 = _mm_add_epi16(pix1,
2677 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2678 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2679 pix3 = _mm_add_epi16(pix3,
2680 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2681 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2682 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2683 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2684 pix2 = _mm_add_epi16(pix2,
2685 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2686 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2687 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2691 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2692 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2693 tci = _mm_madd_epi16(tci, tcoffset);
2694 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2695 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2696 _mm_setzero_si128());
2697 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2698 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2699 _mm_setzero_si128());
2700 fracm = _mm_srli_epi16(subtc, 1);
2701 pix1 = _mm_add_epi16(pix1,
2702 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2703 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2704 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2705 pix1 = _mm_add_epi16(pix1,
2706 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2707 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2708 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Single-texel lookup, clamp-to-edge: integer coordinates are clamped to 0..size-1 and the
// texel word is copied straight to the output, two pixels per iteration plus a one-pixel tail
2715 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2717 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2719 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2720 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2721 tci = _mm_madd_epi16(tci, tcoffset);
2722 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2723 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2727 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2728 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2729 tci = _mm_madd_epi16(tci, tcoffset);
2730 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Single-texel lookup, wrapping: same as above with the coordinates ANDed with size-1
2736 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2738 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2739 tci = _mm_and_si128(tci, tcmax);
2740 tci = _mm_madd_epi16(tci, tcoffset);
2741 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2742 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2746 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2747 tci = _mm_and_si128(tci, tcmax);
2748 tci = _mm_madd_epi16(tci, tcoffset);
2749 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2758 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2761 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// NOTE(review): presumably returns a shadowing factor for the lookup direction 'vector';
// the expected number of components in 'vector' and the range of the result should be
// confirmed against the implementation and its callers.
2764 float DPSOFTRAST_SampleShadowmap(const float *vector)
2770 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2773 int startx = span->startx;
2774 int endx = span->endx;
2779 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2780 for (x = startx;x < endx;x++)
2783 c[0] = (data[0] + slope[0]*x) * z;
2784 c[1] = (data[1] + slope[1]*x) * z;
2785 c[2] = (data[2] + slope[2]*x) * z;
2786 c[3] = (data[3] + slope[3]*x) * z;
2787 out4f[x*4+0] = in4f[x*4+0] * c[0];
2788 out4f[x*4+1] = in4f[x*4+1] * c[1];
2789 out4f[x*4+2] = in4f[x*4+2] * c[2];
2790 out4f[x*4+3] = in4f[x*4+3] * c[3];
2794 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2797 int startx = span->startx;
2798 int endx = span->endx;
2803 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2804 for (x = startx;x < endx;x++)
2807 c[0] = (data[0] + slope[0]*x) * z;
2808 c[1] = (data[1] + slope[1]*x) * z;
2809 c[2] = (data[2] + slope[2]*x) * z;
2810 c[3] = (data[3] + slope[3]*x) * z;
2811 out4f[x*4+0] = c[0];
2812 out4f[x*4+1] = c[1];
2813 out4f[x*4+2] = c[2];
2814 out4f[x*4+3] = c[3];
2818 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2820 int x, startx = span->startx, endx = span->endx;
2821 float c[4], localcolor[4];
2822 localcolor[0] = subcolor[0];
2823 localcolor[1] = subcolor[1];
2824 localcolor[2] = subcolor[2];
2825 localcolor[3] = subcolor[3];
2826 for (x = startx;x < endx;x++)
2828 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2829 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2830 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2831 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2832 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2833 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2834 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2835 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2839 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2841 int x, startx = span->startx, endx = span->endx;
2842 for (x = startx;x < endx;x++)
2844 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2845 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2846 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2847 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2851 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2853 int x, startx = span->startx, endx = span->endx;
2854 for (x = startx;x < endx;x++)
2856 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2857 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2858 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2859 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float spans per pixel: out4f = ina4f * a + inb4f * b for each of
// the four channels, for x in [span->startx, span->endx).
// a is one minus the fourth component of the inb4f pixel.
// NOTE(review): the declaration of a and b and the assignment of b are not
// visible in this excerpt; b is presumably inb4f[x*4+3] -- confirm against the
// full file.
2863 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2865 int x, startx = span->startx, endx = span->endx;
2867 for (x = startx;x < endx;x++)
// weight applied to the first input
2869 a = 1.0f - inb4f[x*4+3];
2871 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2872 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2873 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2874 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2878 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2880 int x, startx = span->startx, endx = span->endx;
2881 float localcolor[4], ilerp, lerp;
2882 localcolor[0] = color[0];
2883 localcolor[1] = color[1];
2884 localcolor[2] = color[2];
2885 localcolor[3] = color[3];
2886 ilerp = 1.0f - localcolor[3];
2887 lerp = localcolor[3];
2888 for (x = startx;x < endx;x++)
2890 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2891 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2892 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2893 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies a BGRA8 input span by an interpolated vertex attribute and writes
// BGRA8 output.  The attribute (selected by arrayindex) is evaluated exactly,
// including the zf factor, only at sub-span boundaries; between boundaries a
// fixed-point modulator is stepped linearly.
// NOTE(review): several lines of this function (declarations of x, data,
// slope, mod and endmod, the per-sub-span hand-over into mod/submod, and the
// guard around the single-pixel tail) are not visible in this excerpt.
2899 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2903 int startx = span->startx;
2904 int endx = span->endx;
2907 __m128i submod, substep, endsubmod;
2908 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 (presumably RGBA -> BGRA to match the byte buffers)
2909 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2910 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, then the same value scaled by 256 as an integer modulator
2911 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2912 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// outer loop: one iteration per sub-span of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels (x is advanced inside)
2913 for (x = startx; x < endx;)
2915 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2916 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly short) sub-span: end on the final pixel and rescale the step to the shorter length
2917 if (nextsub >= endx)
2919 nextsub = endsub = endx-1;
2920 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
2924 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// fixed-point step derived from the difference across the sub-span
2925 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2926 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = modulator for pixel x, high half = modulator for pixel x+1
2927 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2928 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by a single substep per two-pixel iteration, so the
// next pair starts at the value already used for pixel x+1 -- verify whether the
// step should be doubled here.
2929 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// input bytes are placed in the high byte of each 16-bit lane (value << 8) so that
// _mm_mulhi_epu16 with the x256 modulator yields (byte * modulator) >> 8
2931 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2932 pix = _mm_mulhi_epu16(pix, submod);
// _mm_packus_epi16 saturates each lane to 0..255 before the 8-byte store
2933 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (4-byte load/store)
2937 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2938 pix = _mm_mulhi_epu16(pix, submod);
2939 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute (selected by arrayindex) to a BGRA8
// span.  The attribute is evaluated exactly, including the zf factor, only at
// sub-span boundaries; between them a fixed-point value scaled by 4095 is
// stepped linearly and shifted right by 4 before being saturated to bytes.
// NOTE(review): several lines of this function (declarations of x, data,
// slope, mod and endmod, the per-sub-span hand-over into mod/submod, and the
// guard around the single-pixel tail) are not visible in this excerpt.
2946 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2950 int startx = span->startx;
2951 int endx = span->endx;
2954 __m128i submod, substep, endsubmod;
2955 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 (presumably RGBA -> BGRA to match the byte buffer)
2956 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2957 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, then the same value scaled by 4095 as an integer
2958 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2959 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// outer loop: one iteration per sub-span of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels (x is advanced inside)
2960 for (x = startx; x < endx;)
2962 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2963 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly short) sub-span: end on the final pixel and rescale the step to the shorter length
2964 if (nextsub >= endx)
2966 nextsub = endsub = endx-1;
2967 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
2971 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// fixed-point step derived from the difference across the sub-span
2972 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2973 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
2974 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2975 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by a single substep per two-pixel iteration, so the
// next pair starts at the value already used for pixel x+1 -- verify whether the
// step should be doubled here.
2976 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// arithmetic shift by 4 brings the x4095 value into byte range; packus saturates to 0..255
2978 __m128i pix = _mm_srai_epi16(submod, 4);
2979 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (4-byte store)
2983 __m128i pix = _mm_srai_epi16(submod, 4);
2984 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 bloom combine: for each byte channel, out = ina + (inb - subcolor),
// where the subtraction saturates at zero and the final sum is saturated to
// 0..255 when packed back to bytes.
// NOTE(review): the guard in front of the single-pixel tail is not visible in
// this excerpt.
2991 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2994 int x, startx = span->startx, endx = span->endx;
// subcolor scaled by 255 and converted to integers; the shuffle swaps components 0 and 2
2995 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// duplicate the four 16-bit values into both halves so two pixels can be processed at once
2996 localcolor = _mm_packs_epi32(localcolor, localcolor);
// main loop: two pixels (8 bytes) per iteration
2997 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes
2999 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3000 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps (inb - subcolor) at zero
3001 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3002 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (4-byte loads/store)
3006 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3007 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3008 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3009 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel product of two BGRA8 spans: out = (ina * inb) >> 8, saturated to
// 0..255 when packed back to bytes.
// NOTE(review): the guard in front of the single-pixel tail is not visible in
// this excerpt.
3014 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3017 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels (8 bytes) per iteration
3018 for (x = startx;x+2 <= endx;x+=2)
// ina is widened to plain 16-bit values; inb is placed in the high byte of each lane
// (value << 8), so the high half of the 16x16 product is (ina * inb) >> 8
3020 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3021 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3022 pix1 = _mm_mulhi_epu16(pix1, pix2);
3023 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (4-byte loads/store)
3027 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3028 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3029 pix1 = _mm_mulhi_epu16(pix1, pix2);
3030 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel sum of two BGRA8 spans: out = ina + inb, saturated to 0..255
// when packed back to bytes.
// NOTE(review): the guard in front of the single-pixel tail is not visible in
// this excerpt.
3035 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3038 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels (8 bytes) per iteration
3039 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes so the sum cannot wrap before saturation
3041 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3042 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3043 pix1 = _mm_add_epi16(pix1, pix2);
3044 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (4-byte loads/store)
3048 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3049 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3050 pix1 = _mm_add_epi16(pix1, pix2);
3051 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted second BGRA8 span onto the first: out = ina + inb * tint per
// channel, saturated to 0..255 when packed back to bytes.
// inbtintbgra is used in the order given (no component swap is applied here).
// NOTE(review): the guard in front of the single-pixel tail is not visible in
// this excerpt.
3056 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3059 int x, startx = span->startx, endx = span->endx;
// tint scaled by 256 as integers, then duplicated into both halves as 16-bit lanes
3060 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3061 tint = _mm_packs_epi32(tint, tint);
// main loop: two pixels (8 bytes) per iteration
3062 for (x = startx;x+2 <= endx;x+=2)
// inb goes into the high byte of each lane (value << 8) so mulhi with the x256 tint
// gives (inb * tint256) >> 8
3064 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3065 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3066 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3067 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (4-byte loads/store)
3071 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3072 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3073 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3074 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends the second BGRA8 span over the first using the second span's own
// fourth byte as weight: out = ina + ((inb - ina) * inb[3]) >> 8 per channel,
// saturated to 0..255 when packed back to bytes.
// NOTE(review): the guard in front of the single-pixel tail is not visible in
// this excerpt.
3079 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3082 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels (8 bytes) per iteration
3083 for (x = startx;x+2 <= endx;x+=2)
3085 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3086 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel (its fourth byte) across that pixel's four lanes
3087 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half product equals (diff * blend) >> 8
3088 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (only the low four lanes carry data)
3093 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3094 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3095 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3096 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3097 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a BGRA8 span toward one constant colour, weighted by the colour's
// fourth component: out = in + ((color255 - in) * alpha255) >> 8 per channel,
// saturated to 0..255 when packed back to bytes.
// NOTE(review): the guard in front of the single-pixel tail is not visible in
// this excerpt.
3102 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3105 int x, startx = span->startx, endx = span->endx;
// colour scaled by 255 and converted to integers; the shuffle swaps components 0 and 2
3106 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// duplicate the four 16-bit values into both halves so two pixels can be processed at once
3107 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast lane 3 (the scaled fourth component) to every lane, pre-shifted left by 4 for the mulhi below
3108 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// main loop: two pixels (8 bytes) per iteration
3109 for (x = startx;x+2 <= endx;x+=2)
3111 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (diff << 4) * (blend << 4), high half, equals (diff * blend) >> 8
3112 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3113 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (4-byte load/store)
3117 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3118 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3119 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3126 void DPSOFTRAST_VertexShader_Generic(void)
3128 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3129 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3130 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3131 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3132 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3135 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3137 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3138 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3142 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3144 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3145 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3146 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3148 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3149 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3152 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3154 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3157 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3159 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3162 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3167 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3168 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3173 void DPSOFTRAST_VertexShader_PostProcess(void)
3175 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3176 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3177 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3180 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3182 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3183 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3184 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3186 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3187 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3188 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3190 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3191 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3193 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3194 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3196 // TODO: implement saturation
3198 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3200 // TODO: implement gammaramps
3202 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3207 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3209 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3212 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3214 // this is never called (because colormask is off when this shader is used)
3215 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3216 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3217 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3218 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3219 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3224 void DPSOFTRAST_VertexShader_FlatColor(void)
3226 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3227 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader for one span: the colour texture is
// multiplied per channel by the Color_Ambient uniform, with the fourth channel
// using the Alpha uniform instead.
// Output goes directly to the framebuffer row unless alpha test is enabled or
// the blend mode is not opaque; in that case it goes to a local buffer that is
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8 afterwards.
// NOTE(review): some lines of the loop body (declarations of color/pix/pix2,
// the advance of x after the four-pixel path and any per-pixel mask test for
// the single-pixel path) are not visible in this excerpt.
3230 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3233 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in the first colour buffer, 4 bytes per pixel
3234 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3235 int x, startx = span->startx, endx = span->endx;
3236 __m128i Color_Ambientm;
3237 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3238 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3240 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3241 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the generic finish path, so render into the local buffer instead
3242 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3243 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256 as integers; the shuffle swaps components 0 and 2
3244 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear the fourth lane and replace it with the Alpha uniform scaled by 255
3245 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3246 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// duplicate into both halves as 16-bit lanes (two pixels per register)
3247 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3248 for (x = startx;x < endx;x++)
// fast path: at least four pixels remain and all four mask bytes are 1
3251 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3254 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texels go into the high byte of each lane (value << 8); mulhi with the scaled colour gives (texel * colour) >> 8
3255 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3256 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3257 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same computation on one texel
3263 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3264 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3265 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the redirected (local buffer) case still needs to be written out
3267 if (pixel == buffer_FragColorbgra8)
3268 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3274 void DPSOFTRAST_VertexShader_VertexColor(void)
3276 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3277 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3278 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader for one span:
//   out = texture * (Color_Diffuse * interpolated vertex colour + Color_Ambient)
// per channel in fixed point, with the fourth channel taking the Alpha uniform
// on the ambient side and zero on the diffuse side.
// Output goes directly to the framebuffer row unless alpha test is enabled or
// the blend mode is not opaque; in that case it goes to a local buffer that is
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8 afterwards.
// NOTE(review): some lines (declarations of data, slope, mod2 and pix2, the
// advance of x after the four-pixel path and any per-pixel mask test for the
// single-pixel path) are not visible in this excerpt.
3281 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3284 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in the first colour buffer, 4 bytes per pixel
3285 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3286 int x, startx = span->startx, endx = span->endx;
3287 __m128i Color_Ambientm, Color_Diffusem;
3289 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3290 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3292 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3293 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3294 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the generic finish path, so render into the local buffer instead
3295 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3296 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256; the shuffle swaps components 0 and 2
3297 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// fourth lane: replaced by the Alpha uniform scaled by 255
3298 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3299 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3300 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 4096, fourth lane cleared, duplicated into both halves
3301 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3302 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3303 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex colour attribute: component swap as above, advanced to startx, and scaled by 4096
3304 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3305 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3306 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3307 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3308 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3309 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration; the four-pixel path adds three more steps itself
3310 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3312 __m128i color, mod, pix;
// fast path: at least four pixels remain and all four mask bytes are 1
3313 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four consecutive z values, one per pixel
3316 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3317 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// mod holds the z-scaled colour for pixels x and x+1, mod2 for pixels x+2 and x+3
3318 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3319 data = _mm_add_ps(data, slope);
3320 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3321 data = _mm_add_ps(data, slope);
3322 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3323 data = _mm_add_ps(data, slope);
3324 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex colour + ambient) multiplied by the texel held in the high byte of each lane
3325 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3326 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3327 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3328 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3329 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same computation on one texel
3335 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3336 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3337 mod = _mm_packs_epi32(mod, mod);
3338 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3339 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the redirected (local buffer) case still needs to be written out
3341 if (pixel == buffer_FragColorbgra8)
3342 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3348 void DPSOFTRAST_VertexShader_Lightmap(void)
3350 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3351 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3352 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
//   out = colour texture * (Color_Diffuse * lightmap texture + Color_Ambient)
// and, in the loop that reads the glow texture,
//   out += Color_Glow * glow texture
// all per channel in fixed point.  The fourth channel takes the Alpha uniform
// on the ambient side and zero on the diffuse and glow sides.
// Output goes directly to the framebuffer row unless alpha test is enabled or
// the blend mode is not opaque; in that case it goes to a local buffer that is
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8 afterwards.
// NOTE(review): the braces/else separating the two pixel loops, the pix2
// declarations, the advance of x after each four-pixel path and any per-pixel
// mask test are not visible in this excerpt; the second loop is presumably the
// branch taken when SHADERPERMUTATION_GLOW is clear -- confirm.
3355 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3358 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in the first colour buffer, 4 bytes per pixel
3359 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3360 int x, startx = span->startx, endx = span->endx;
3361 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3362 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3363 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3367 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// colour texture uses TEXCOORD0, lightmap texture uses TEXCOORD4
3368 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3369 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the generic finish path, so render into the local buffer instead
3370 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3371 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256; the shuffle swaps components 0 and 2
3372 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// fourth lane: replaced by the Alpha uniform scaled by 255
3373 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3375 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 256, fourth lane cleared, duplicated into both halves
3376 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3377 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3378 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3379 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture shares TEXCOORD0 with the colour texture
3381 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow colour scaled by 256, fourth lane cleared, duplicated into both halves
3382 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3383 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3384 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow; used by the single-pixel path to do both multiplies at once
3385 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3386 for (x = startx;x < endx;x++)
3388 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and all four mask bytes are 1
3389 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3392 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3393 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3394 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// pixels x, x+1: (diffuse * lightmap + ambient) * colour + glowcolour * glow
3395 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3396 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3397 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
// pixels x+2, x+3: same expression on the upper eight bytes
3398 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3399 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3400 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3401 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: each texel occupies only the low four lanes
3407 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3408 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3409 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
// low half: (diffuse * lightmap + ambient) * colour; high half: (0 + glowcolour) * glow,
// because the upper lanes of lightmap are zero
3410 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// fold the glow term (high half) onto the lit colour (low half)
3411 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3412 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// loop without the glow term
3417 for (x = startx;x < endx;x++)
3419 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and all four mask bytes are 1
3420 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3423 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3424 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * colour for pixels x, x+1 and then x+2, x+3
3425 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3426 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3427 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3428 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3429 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same computation on one texel
3435 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3436 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3437 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3438 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the redirected (local buffer) case still needs to be written out
3441 if (pixel == buffer_FragColorbgra8)
3442 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shader entry
// points that follow call these two functions before their definitions appear.
3447 void DPSOFTRAST_VertexShader_LightDirection(void);
3448 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex stage of the fake-light shader: forwards to the LightDirection vertex
// shader without any additional work.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3455 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3457 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3462 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3464 DPSOFTRAST_VertexShader_LightDirection();
3465 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the model-space light-direction-map mode: forwards its arguments
// unchanged to the shared LightDirection pixel shader, which branches on
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3468 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3470 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the tangent-space light-direction-map mode: runs the shared
// LightDirection vertex shader, then additionally loads TEXCOORD4, the attribute the
// LightDirection pixel shader uses to sample the lightmap and deluxemap textures.
3475 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3477 DPSOFTRAST_VertexShader_LightDirection();
3478 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the tangent-space light-direction-map mode: forwards its arguments
// unchanged to the shared LightDirection pixel shader, which branches on
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3481 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3483 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Shared vertex shader for the light-direction, fake-light and light-direction-map modes.
// For every vertex it writes two extra attributes into dpsoftrast.post_array4f:
//   TEXCOORD5 = the LightDir uniform projected onto (svector, tvector, normal), w = 0
//   TEXCOORD6 = (EyePosition - position) projected onto (svector, tvector, normal), w = 0
// and finally transforms/projects POSITION with the ModelViewProjection matrix.
3488 void DPSOFTRAST_VertexShader_LightDirection(void)
3491 int numvertices = dpsoftrast.numvertices;
3493 float LightVector[4];
3494 float EyePosition[4];
3495 float EyeVectorModelSpace[4];
// fetch the light direction and eye position uniforms
// (component 3 of each is loaded but not used by the code below)
3501 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3502 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3503 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3504 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3505 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3506 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3507 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3508 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// NOTE(review): Array_Load/Array_Transform presumably fill post_array4f for the named
// attribute (TEXCOORD0 additionally going through TexMatrix) - confirm against their definitions
3509 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3510 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3511 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3512 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3513 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3514 for (i = 0;i < numvertices;i++)
// per-vertex inputs: position plus the basis read from TEXCOORD1 (svector),
// TEXCOORD2 (tvector) and TEXCOORD3 (normal)
3516 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3517 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3518 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3519 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3520 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3521 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3522 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3523 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3524 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3525 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3526 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3527 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light direction expressed in the (svector, tvector, normal) basis: one dot product per axis
3528 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3529 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3530 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3531 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3532 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3533 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3534 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// vertex-to-eye vector, expressed in the same basis
3535 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3536 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3537 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3538 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3539 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3540 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3541 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3542 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3543 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3544 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// done last, after the loop has finished reading the untransformed positions
3546 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar and 3-component vector helper macros.
// Min/Max expand each argument twice, so arguments must be free of side effects.
3549 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3550 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product over elements 0..2 of a and b
3551 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length of v (v dotted with itself)
3552 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// length of v
3553 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-statement macro that starts by computing len = sqrt(dot(v,v)).
// NOTE(review): the rest of the expansion is not visible here; judging by the name and
// its call sites it presumably rescales v in place to unit length - confirm.
3554 #define DPSOFTRAST_Vector3Normalize(v)\
3557 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the light-direction, fake-light and light-direction-map
// (model space / tangent space) modes. Shades pixels startx..endx-1 of one span into
// buffer_FragColorbgra8 and hands the result to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread   - per-thread state; supplies uniform4f, shader_mode and shader_permutation
// triangle - triangle being drawn, passed through to the attribute and texture helpers
// span     - span being shaded
//
// One of three paths is taken, selected by thread->shader_permutation:
//   SPECULAR:  glow + diffusetex*ambient + (diffusetex*Color_Diffuse*diffuse + glosstex*Color_Specular*specular) * LightColor
//   DIFFUSE:   glow + diffusetex*(ambient + Color_Diffuse*diffuse*LightColor)
//   otherwise: glow + diffusetex*ambient
// The glow term is only added with SHADERPERMUTATION_GLOW; pants/shirt tinting
// (SHADERPERMUTATION_COLORMAPPING) is only applied to diffusetex in the SPECULAR path.
// Alpha is always diffusetex alpha times the Alpha uniform. Channels are clamped to 255.
3568 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3570 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3571 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3572 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3573 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3574 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3575 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3576 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580 int x, startx = span->startx, endx = span->endx;
3581 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3582 float LightVectordata[4];
3583 float LightVectorslope[4];
3584 float EyeVectordata[4];
3585 float EyeVectorslope[4];
3586 float VectorSdata[4];
3587 float VectorSslope[4];
3588 float VectorTdata[4];
3589 float VectorTslope[4];
3590 float VectorRdata[4];
3591 float VectorRslope[4];
3593 float diffusetex[4];
3595 float surfacenormal[4];
3596 float lightnormal[4];
3597 float lightnormal_modelspace[4];
3599 float specularnormal[4];
3602 float SpecularPower;
// colour uniforms are stored with components 0 and 2 swapped so that index k of each
// Color_* array pairs with byte k of the *bgra8 texture buffers
3604 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3605 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3606 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3607 Color_Glow[3] = 0.0f;
3608 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3609 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3610 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the ambient alpha slot carries the Alpha uniform; it scales the output alpha in every path
3611 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3612 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3613 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3614 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3615 Color_Pants[3] = 0.0f;
3616 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3617 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3618 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3619 Color_Shirt[3] = 0.0f;
// textures needed by every path: base colour, plus pants/shirt and glow when enabled
3620 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3621 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3622 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3624 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3625 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3629 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: diffuse + specular lighting ----
3631 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3633 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3634 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3635 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3636 Color_Diffuse[3] = 0.0f;
3637 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3638 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3639 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3640 LightColor[3] = 0.0f;
3641 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3642 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3643 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3644 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3645 Color_Specular[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the 0..255 gloss alpha byte
3646 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// the eye vector is needed by every mode in this path (for the specular half vector)
3647 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3648 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// mode-specific inputs for the light direction and colour
3650 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3652 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3653 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3654 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3655 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3656 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3658 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3660 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3661 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3663 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3665 // nothing of this needed
3669 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3672 for (x = startx;x < endx;x++)
3675 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3676 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3677 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3678 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3679 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3681 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3682 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3683 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3684 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3686 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3687 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3688 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3689 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// unpack the normal map texel: bytes 2,1,0 become x,y,z, remapped from 0..255 to about -1..1
3690 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3691 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3692 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3693 DPSOFTRAST_Vector3Normalize(surfacenormal);
3695 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3697 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3698 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3699 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3700 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3702 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3703 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3704 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3705 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3707 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3708 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3709 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3710 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3712 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3713 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3714 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3715 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3717 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3718 DPSOFTRAST_Vector3Normalize(lightnormal);
3720 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// lightmap colour divided by the light's z component (floored at 0.25 to bound the gain)
3722 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3723 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3724 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3725 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3728 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// the deluxemap texel is used directly as the light direction (not renormalized here)
3730 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3731 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3732 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3734 float f = 1.0f / 256.0f;
3735 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3736 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3737 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3740 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// fake light: the light direction is the direction towards the eye, with white light
3742 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3743 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3744 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
// normalize the vector that was just computed; eyenormal is not assigned until further
// down, so normalizing it here left lightnormal unnormalized and read a stale/unset eyenormal
3745 DPSOFTRAST_Vector3Normalize(lightnormal);
3747 LightColor[0] = 1.0;
3748 LightColor[1] = 1.0;
3749 LightColor[2] = 1.0;
// remaining mode: interpolated per-vertex light vector, LightColor stays the uniform value
3753 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3754 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3755 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3756 DPSOFTRAST_Vector3Normalize(lightnormal);
// direction to the eye, then the half vector between light and eye for the specular term
3759 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3760 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3761 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3762 DPSOFTRAST_Vector3Normalize(eyenormal);
3764 specularnormal[0] = lightnormal[0] + eyenormal[0];
3765 specularnormal[1] = lightnormal[1] + eyenormal[1];
3766 specularnormal[2] = lightnormal[2] + eyenormal[2];
3767 DPSOFTRAST_Vector3Normalize(specularnormal);
// both lighting terms are clamped at zero so back-facing light contributes nothing
3769 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3770 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3771 specular = pow(specular, SpecularPower * glosstex[3]);
3772 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3774 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3775 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3776 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3777 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3781 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3782 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3783 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3784 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3787 buffer_FragColorbgra8[x*4+0] = d[0];
3788 buffer_FragColorbgra8[x*4+1] = d[1];
3789 buffer_FragColorbgra8[x*4+2] = d[2];
3790 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: diffuse lighting only (no gloss texture, no eye-dependent specular) ----
3793 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3795 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3796 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3797 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3798 Color_Diffuse[3] = 0.0f;
3799 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3800 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3801 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3802 LightColor[3] = 0.0f;
3803 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3805 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3807 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3808 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3809 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3810 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3811 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3813 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3815 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3816 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3818 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3820 // fake light takes its light direction from the interpolated eye vector; unlike the specular path, nothing has computed it yet here
DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3824 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3827 for (x = startx;x < endx;x++)
3830 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3831 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3832 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3833 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3834 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3835 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3836 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3837 DPSOFTRAST_Vector3Normalize(surfacenormal);
3839 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3841 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3842 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3843 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3844 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3846 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3847 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3848 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3849 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3851 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3852 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3853 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3854 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3856 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3857 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3858 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3859 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3861 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3862 DPSOFTRAST_Vector3Normalize(lightnormal);
3864 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3866 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3867 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3868 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3869 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3872 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3874 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3875 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3876 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3878 float f = 1.0f / 256.0f;
3879 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3880 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3881 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3884 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3886 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3887 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3888 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
// eyenormal is never assigned in this path; the vector that needs unit length is lightnormal
3889 DPSOFTRAST_Vector3Normalize(lightnormal);
3891 LightColor[0] = 1.0;
3892 LightColor[1] = 1.0;
3893 LightColor[2] = 1.0;
3897 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3898 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3899 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3900 DPSOFTRAST_Vector3Normalize(lightnormal);
3903 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3904 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3906 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3907 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3908 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3909 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3913 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3914 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3915 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3916 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3918 buffer_FragColorbgra8[x*4+0] = d[0];
3919 buffer_FragColorbgra8[x*4+1] = d[1];
3920 buffer_FragColorbgra8[x*4+2] = d[2];
3921 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (neither SPECULAR nor DIFFUSE) ----
3926 for (x = startx;x < endx;x++)
3929 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3930 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3931 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3932 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3934 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3936 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3937 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3938 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3939 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3943 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3944 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3945 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3946 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3948 buffer_FragColorbgra8[x*4+0] = d[0];
3949 buffer_FragColorbgra8[x*4+1] = d[1];
3950 buffer_FragColorbgra8[x*4+2] = d[2];
3951 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish step
3954 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the light-source mode.
// For every vertex it replaces two attributes in dpsoftrast.post_array4f:
//   TEXCOORD1 = (LightPosition - position) projected onto (svector, tvector, normal), w = 0
//   TEXCOORD2 = (EyePosition - position) projected onto (svector, tvector, normal), w = 0
// then transforms/projects POSITION and issues a ModelToLight transform involving
// TEXCOORD3 and POSITION.
3959 void DPSOFTRAST_VertexShader_LightSource(void)
3962 int numvertices = dpsoftrast.numvertices;
3963 float LightPosition[4];
3964 float LightVector[4];
3965 float LightVectorModelSpace[4];
3966 float EyePosition[4];
3967 float EyeVectorModelSpace[4];
// fetch the light and eye position uniforms
// (component 3 of each is loaded but not used by the code below)
3973 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3974 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3975 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3976 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3977 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3978 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3979 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3980 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// NOTE(review): Array_Load/Array_Transform presumably fill post_array4f for the named
// attribute (TEXCOORD0 additionally going through TexMatrix) - confirm against their definitions
3981 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3982 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3983 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3984 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3985 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3986 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3987 for (i = 0;i < numvertices;i++)
// read this vertex's position and basis (TEXCOORD1 = svector, TEXCOORD2 = tvector,
// TEXCOORD3 = normal) into locals first: TEXCOORD1 and TEXCOORD2 are overwritten below
3989 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3990 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3991 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3992 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3993 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3994 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3995 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3996 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3997 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3998 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3999 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4000 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vertex-to-light vector expressed in the (svector, tvector, normal) basis
4001 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4002 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4003 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4004 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4005 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4006 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4007 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4008 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4009 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4010 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vertex-to-eye vector expressed in the same basis
4011 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4012 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4013 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4014 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4015 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4016 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4017 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4018 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4019 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4020 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4022 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// NOTE(review): presumably writes TEXCOORD3 as POSITION transformed by the ModelToLight
// matrix (argument order and whether it reads the input or the projected position
// cannot be told from here) - confirm against DPSOFTRAST_Array_Transform
4023 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel (span) shader that lights one span of a triangle with a single light source.
//   thread   - supplies the uniforms (thread->uniform4f) and the shader_permutation bits tested below.
//   triangle - forwarded to the attribute and texture helpers.
//   span     - span->startx / span->endx bound the pixels processed; also forwarded to the helpers.
// BGRA8 colors are built in buffer_FragColorbgra8 and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Three variants are selected by shader_permutation: SPECULAR (ambient + diffuse + specular),
// DIFFUSE (ambient + diffuse), and a final variant that applies only the ambient term.
// NOTE(review): this listing elides short lines (braces, "else", and the declarations/assignments of
// z, attenuation, glosstex, eyenormal, diffuse, specular and d), so those are not visible here.
4026 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one float per pixel for buffer_z, four bytes per pixel for each texture/color buffer.
4029 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4030 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4031 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4032 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4033 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4034 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4035 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4036 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4037 int x, startx = span->startx, endx = span->endx;
4038 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each interpolated attribute is kept as a base value ("data") plus a per-pixel step ("slope");
// the loops below evaluate (data + slope*x) * z.
4039 float CubeVectordata[4];
4040 float CubeVectorslope[4];
4041 float LightVectordata[4];
4042 float LightVectorslope[4];
4043 float EyeVectordata[4];
4044 float EyeVectorslope[4];
4046 float diffusetex[4];
4048 float surfacenormal[4];
4049 float lightnormal[4];
4051 float specularnormal[4];
4054 float SpecularPower;
4055 float CubeVector[4];
// Load the color uniforms with components 0 and 2 swapped (uniform component 0 -> index 2, component 2 -> index 0),
// presumably so the indices line up with the BGRA byte order of the texture buffers - confirm.
// Color_Glow is loaded here but not read anywhere in the visible part of this function.
4058 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4059 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4060 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4061 Color_Glow[3] = 0.0f;
4062 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4063 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4064 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component comes from the separate Alpha uniform.
4065 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4066 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4067 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4068 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4069 Color_Diffuse[3] = 0.0f;
4070 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4071 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4072 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4073 Color_Specular[3] = 0.0f;
4074 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4075 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4076 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4077 Color_Pants[3] = 0.0f;
4078 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4079 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4080 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4081 Color_Shirt[3] = 0.0f;
4082 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4083 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4084 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4085 LightColor[3] = 0.0f;
// Pre-scaled by 1/255 because it is later multiplied by glosstex[3], which is read from an unsigned char texel.
4086 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Fetch data/slope pairs: TEXCOORD1 -> light vector, TEXCOORD2 -> eye vector, TEXCOORD3 -> cube vector.
4087 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4088 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4089 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4090 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4091 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// The base color texture is sampled unconditionally; the other textures follow permutation checks.
4092 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4093 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Pants and shirt layers (the braces around these two calls are elided in this listing).
4095 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4096 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4098 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4099 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- Variant 1: SPECULAR - needs the normal and gloss textures ----
4100 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4102 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4103 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4104 for (x = startx;x < endx;x++)
// NOTE(review): the assignment of z is on an elided line - presumably taken from buffer_z[x]; confirm.
4107 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4108 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4109 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// Attenuation is 1 - |CubeVector|^2.
4110 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by this test is elided; it likely skips the pixel, leaving it cleared.
4111 if (attenuation < 0.01f)
4113 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4115 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4116 if (attenuation < 0.01f)
// Base color texel as floats (byte values, not normalized to 0..1).
4120 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4121 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4122 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4123 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4124 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add the pants/shirt layers tinted by their colors (index 3 of both tint colors is 0, so alpha is unchanged).
4126 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4127 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4128 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4129 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4131 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4132 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4133 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4134 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: byte offsets 2,1,0 become components 0,1,2, each mapped by b/128 - 1, then normalized.
4135 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4136 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4137 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4138 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Interpolated, normalized light direction.
4140 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4141 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4142 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4143 DPSOFTRAST_Vector3Normalize(lightnormal);
// Interpolated, normalized eye direction.
4145 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4146 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4147 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4148 DPSOFTRAST_Vector3Normalize(eyenormal);
// Normalized sum of the light and eye directions, used for the specular term.
4150 specularnormal[0] = lightnormal[0] + eyenormal[0];
4151 specularnormal[1] = lightnormal[1] + eyenormal[1];
4152 specularnormal[2] = lightnormal[2] + eyenormal[2];
4153 DPSOFTRAST_Vector3Normalize(specularnormal);
// Both dot products are clamped at zero; the specular one is then raised to SpecularPower * gloss alpha.
4155 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4156 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4157 specular = pow(specular, SpecularPower * glosstex[3]);
4158 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4160 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4161 attenuation *= (1.0f / 255.0f);
// With cube filter: the lit color is additionally multiplied by the cube-map texel; results are clamped to at most 255.
4162 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4163 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4164 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4165 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Without cube filter (the "else" line is elided in this listing): same formula minus the cube-map factor.
4169 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4170 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4171 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4172 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Store the pixel; the fourth byte is the (clamped) diffuse texture alpha.
4174 buffer_FragColorbgra8[x*4+0] = d[0];
4175 buffer_FragColorbgra8[x*4+1] = d[1];
4176 buffer_FragColorbgra8[x*4+2] = d[2];
4177 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Variant 2: DIFFUSE - same as above without the gloss texture, eye vector and specular term ----
4180 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4182 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4183 for (x = startx;x < endx;x++)
4186 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4187 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4188 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4189 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4190 if (attenuation < 0.01f)
4192 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4194 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4195 if (attenuation < 0.01f)
4199 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4200 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4201 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4202 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4203 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4205 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4206 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4207 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4208 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4210 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4211 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4212 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4213 DPSOFTRAST_Vector3Normalize(surfacenormal);
4215 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4216 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4217 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4218 DPSOFTRAST_Vector3Normalize(lightnormal);
4220 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4221 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4223 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4224 attenuation *= (1.0f / 255.0f);
4225 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4226 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4227 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4228 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// No cube filter (the "else" line is elided in this listing).
4232 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4233 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4234 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4235 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4237 buffer_FragColorbgra8[x*4+0] = d[0];
4238 buffer_FragColorbgra8[x*4+1] = d[1];
4239 buffer_FragColorbgra8[x*4+2] = d[2];
4240 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Variant 3: neither SPECULAR nor DIFFUSE (the "else" line is elided) - ambient term only, no normal map ----
4245 for (x = startx;x < endx;x++)
4248 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4249 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4250 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4251 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4252 if (attenuation < 0.01f)
4254 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4256 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4257 if (attenuation < 0.01f)
4261 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4262 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4263 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4264 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4265 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4267 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4268 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4269 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4270 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4272 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4274 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4275 attenuation *= (1.0f / 255.0f);
4276 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4277 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4278 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4279 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// No cube filter (the "else" line is elided in this listing).
4283 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4284 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4285 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4286 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4288 buffer_FragColorbgra8[x*4+0] = d[0];
4289 buffer_FragColorbgra8[x*4+1] = d[1];
4290 buffer_FragColorbgra8[x*4+2] = d[2];
4291 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span colors to the common span-finish routine.
4294 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the Refraction shader mode.
// The only visible work is one DPSOFTRAST_Array_TransformProject call that passes the POSITION array
// as both array arguments together with the ModelViewProjection matrix uniform.
// (Braces are elided in this listing.)
4300 void DPSOFTRAST_VertexShader_Refraction(void)
4302 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel (span) shader for the Refraction shader mode.
// The visible code performs no texturing or lighting: it begins the span, zero-fills the BGRA8 output
// for pixels [span->startx, span->endx), and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
// NOTE(review): looks like a placeholder implementation - confirm against the full source.
4305 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4308 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4309 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4310 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Four bytes per pixel, hence the *4 on both the offset and the length.
4311 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4312 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the Water shader mode.
// The only visible work is one DPSOFTRAST_Array_TransformProject call that passes the POSITION array
// as both array arguments together with the ModelViewProjection matrix uniform.
// (Braces are elided in this listing.)
4317 void DPSOFTRAST_VertexShader_Water(void)
4319 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel (span) shader for the Water shader mode.
// The visible code performs no texturing or lighting: it begins the span, zero-fills the BGRA8 output
// for pixels [span->startx, span->endx), and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
// NOTE(review): looks like a placeholder implementation - confirm against the full source.
4323 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4326 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4327 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4328 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Four bytes per pixel, hence the *4 on both the offset and the length.
4329 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4330 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the ShowDepth shader mode.
// The only visible work is one DPSOFTRAST_Array_TransformProject call that passes the POSITION array
// as both array arguments together with the ModelViewProjection matrix uniform.
// (Braces are elided in this listing.)
4335 void DPSOFTRAST_VertexShader_ShowDepth(void)
4337 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel (span) shader for the ShowDepth shader mode.
// The visible code does not read buffer_z after filling it: it begins the span, zero-fills the BGRA8 output
// for pixels [span->startx, span->endx), and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
// NOTE(review): looks like a placeholder implementation - confirm against the full source.
4340 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4343 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4344 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4345 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Four bytes per pixel, hence the *4 on both the offset and the length.
4346 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4347 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the DeferredGeometry shader mode.
// The only visible work is one DPSOFTRAST_Array_TransformProject call that passes the POSITION array
// as both array arguments together with the ModelViewProjection matrix uniform.
// (Braces are elided in this listing.)
4352 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4354 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel (span) shader for the DeferredGeometry shader mode.
// The visible code performs no texturing or lighting: it begins the span, zero-fills the BGRA8 output
// for pixels [span->startx, span->endx), and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
// NOTE(review): looks like a placeholder implementation - confirm against the full source.
4357 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4360 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4361 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4362 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Four bytes per pixel, hence the *4 on both the offset and the length.
4363 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4364 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the DeferredLightSource shader mode.
// The only visible work is one DPSOFTRAST_Array_TransformProject call that passes the POSITION array
// as both array arguments together with the ModelViewProjection matrix uniform.
// (Braces are elided in this listing.)
4369 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4371 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel (span) shader for the DeferredLightSource shader mode.
// The visible code performs no texturing or lighting: it begins the span, zero-fills the BGRA8 output
// for pixels [span->startx, span->endx), and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
// NOTE(review): looks like a placeholder implementation - confirm against the full source.
4374 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4377 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4378 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4379 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Four bytes per pixel, hence the *4 on both the offset and the length.
4380 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4381 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode descriptor: one instance per entry of DPSOFTRAST_ShaderModeTable.
// NOTE(review): the table initializers start with an integer (2) before the Vertex pointer, so a leading
// integer member exists on a line this listing elides (along with the braces) - confirm its name and meaning.
4386 typedef struct DPSOFTRAST_ShaderModeInfo_s
// Vertex-stage entry point for the mode; takes no arguments.
4389 void (*Vertex)(void);
// Span (pixel-stage) entry point; DPSOFTRAST_Draw_ProcessSpans calls it once per span.
4390 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Lists of DPSOFTRAST_ARRAY_* and GL20TU_* values; the table initializers end each list with ~0
// (which becomes 255 when stored in an unsigned char).
4391 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4392 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4394 DPSOFTRAST_ShaderModeInfo;
// Dispatch table with SHADERMODE_COUNT entries; DPSOFTRAST_Draw_ProcessSpans indexes it with thread->shader_mode
// to find the Span function. Each row is { leading integer, Vertex function, Span function, array list, texunit list }.
// Presumably the row order matches the SHADERMODE_* enumeration - confirm against its definition.
// NOTE(review): the last five rows (Refraction .. DeferredLightSource) give no texunits initializer, so C
// zero-fills that member instead of starting it with the ~0 terminator used by the other rows - confirm the
// code that walks texunits handles this as intended.
4396 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4398 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4399 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4400 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4401 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4402 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4403 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4404 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4405 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4406 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4407 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4408 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
// The remaining modes use no vertex arrays beyond position; their pixel shaders only emit zeroed spans.
4409 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4410 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4411 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4412 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4413 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Processes every span queued on this thread (thread->spans[0 .. numspans-1]) and then empties the queue.
// For each span: when depth testing is enabled and a depth buffer exists, a per-pixel mask is built from the
// depth comparison, the span is trimmed to the unmasked range, and the shader mode's Span function runs if a
// color buffer is bound and the color mask is non-zero; otherwise the Span function runs with a fully-set mask.
//   thread - owns the span/triangle queues and the depth/color state consulted here.
// NOTE(review): this listing elides short lines (braces, several declarations such as i, x, endx, d, depth,
// depthslope, w, wslope, some loop bodies and guards), so those are not visible here.
4416 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4423 // unsigned int *colorpixel;
4424 unsigned int *depthpixel;
4430 DPSOFTRAST_State_Triangle *triangle;
4431 DPSOFTRAST_State_Span *span;
// One mask byte per pixel of the current span; non-zero means the pixel passed the depth test.
4432 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4433 for (i = 0; i < thread->numspans; i++)
4435 span = &thread->spans[i];
4436 triangle = &thread->triangles[span->triangle];
4437 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// w is evaluated at (span->x, span->y) from the triangle's w plane: w[0] is the per-x step, w[1] the per-y step, w[2] the base.
4439 wslope = triangle->w[0];
4440 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// Integer depth and its per-pixel step; the depth value is biased by the polygon offset terms.
4441 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4442 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel addresses the depth buffer at (span->x, span->y); the loops below index it by x in [startx, endx).
4443 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4444 startx = span->startx;
// Fill pixelmask according to the depth function, stepping d by depthslope per pixel.
// NOTE(review): the assignment of endx and any default case are on elided lines.
4446 switch(thread->fb_depthfunc)
4449 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4450 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4451 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4452 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4453 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4454 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4455 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4457 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4458 //for (x = startx;x < endx;x++)
4459 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4460 // if there is no color buffer, skip pixel shader
// Trim masked-out pixels from both ends of the span (the loop bodies are elided; presumably startx++ / endx--).
4461 while (startx < endx && !pixelmask[startx])
4463 while (endx > startx && !pixelmask[endx-1])
4466 continue; // no pixels to fill
// Publish the mask and the trimmed range to the span before running the shader.
4467 span->pixelmask = pixelmask;
4468 span->startx = startx;
4470 // run pixel shader if appropriate
4471 // do this before running depthmask code, to allow the pixelshader
4472 // to clear pixelmask values for alpha testing
4473 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4474 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// Depth-write pass over the trimmed range (the loop body is elided - presumably stores d for unmasked pixels; confirm).
4475 if (thread->depthmask)
4476 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4482 // no depth testing means we're just dealing with color...
4483 // if there is no color buffer, skip pixel shader
4484 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// Every pixel of the span passes: set the whole mask range to 1.
4486 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4487 span->pixelmask = pixelmask;
4488 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// All queued spans have been consumed.
4492 thread->numspans = 0;
// Declares the Draw command through the DEFCOMMAND macro (defined outside this view) with id argument 22 and
// the payload fields listed here; the interpreter below receives it as DPSOFTRAST_Command_Draw.
// refcount is an ATOMIC_COUNTER that the interpreter decrements with ATOMIC_DECREMENT.
// NOTE(review): the interpreter also reads command->commandsize, which is not in this field list - presumably
// added by DEFCOMMAND itself; confirm against the macro definition.
4495 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4497 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4500 int cullface = thread->cullface;
4501 int minx, maxx, miny, maxy;
4502 int miny1, maxy1, miny2, maxy2;
4503 __m128i fbmin, fbmax;
4504 __m128 viewportcenter, viewportscale;
4505 int firstvertex = command->firstvertex;
4506 int numvertices = command->numvertices;
4507 int numtriangles = command->numtriangles;
4508 const int *element3i = command->element3i;
4509 const unsigned short *element3s = command->element3s;
4510 int clipped = command->clipped;
4517 int starty, endy, bandy;
4521 __m128 triangleedge1, triangleedge2, trianglenormal;
4524 DPSOFTRAST_State_Triangle *triangle;
4525 DPSOFTRAST_Texture *texture;
4526 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4527 miny = thread->fb_scissor[1];
4528 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4529 miny1 = bound(miny, thread->miny1, maxy);
4530 maxy1 = bound(miny, thread->maxy1, maxy);
4531 miny2 = bound(miny, thread->miny2, maxy);
4532 maxy2 = bound(miny, thread->maxy2, maxy);
4533 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4535 if (!ATOMIC_DECREMENT(command->refcount))
4537 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4538 MM_FREE(command->arrays);
4542 minx = thread->fb_scissor[0];
4543 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4544 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4545 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4546 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4547 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4548 screen[3] = _mm_setzero_ps();
4549 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4550 for (i = 0;i < numtriangles;i++)
4552 const float *screencoord4f = command->arrays;
4553 const float *arrays = screencoord4f + numvertices*4;
4555 // generate the 3 edges of this triangle
4556 // generate spans for the triangle - switch based on left split or right split classification of triangle
4559 e[0] = element3s[i*3+0] - firstvertex;
4560 e[1] = element3s[i*3+1] - firstvertex;
4561 e[2] = element3s[i*3+2] - firstvertex;
4565 e[0] = element3i[i*3+0] - firstvertex;
4566 e[1] = element3i[i*3+1] - firstvertex;
4567 e[2] = element3i[i*3+2] - firstvertex;
4576 #define SKIPBACKFACE \
4577 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4578 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4579 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4580 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4581 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4585 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4589 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4594 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4595 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4597 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4598 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4600 #define CLIPPEDVERTEXCOPY(k,p1) \
4601 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4603 #define GENATTRIBCOPY(attrib, p1) \
4604 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4605 #define GENATTRIBLERP(attrib, p1, p2) \
4607 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4608 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4610 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4614 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4615 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4616 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4617 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4618 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4619 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4620 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4626 // calculate distance from nearplane
4627 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4628 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4629 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4630 if (clipdist[0] >= 0.0f)
4632 if (clipdist[1] >= 0.0f)
4634 if (clipdist[2] >= 0.0f)
4637 // triangle is entirely in front of nearplane
4638 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4645 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4653 if (clipdist[2] >= 0.0f)
4655 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4662 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4669 else if (clipdist[1] >= 0.0f)
4671 if (clipdist[2] >= 0.0f)
4673 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4680 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4686 else if (clipdist[2] >= 0.0f)
4688 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4693 else continue; // triangle is entirely behind nearplane
4696 // calculate integer y coords for triangle points
4697 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4698 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4699 screenmin = _mm_min_epi16(screeni, screenir),
4700 screenmax = _mm_max_epi16(screeni, screenir);
4701 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4702 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4703 screenmin = _mm_max_epi16(screenmin, fbmin);
4704 screenmax = _mm_min_epi16(screenmax, fbmax);
4705 // skip offscreen triangles
4706 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4708 starty = _mm_extract_epi16(screenmin, 1);
4709 endy = _mm_extract_epi16(screenmax, 1)+1;
4710 if (starty >= maxy1 && endy <= miny2)
4712 screeny = _mm_srai_epi32(screeni, 16);
4715 triangle = &thread->triangles[thread->numtriangles];
4717 // calculate attribute plans for triangle data...
4718 // okay, this triangle is going to produce spans, we'd better project
4719 // the interpolants now (this is what gives perspective texturing),
4720 // this consists of simply multiplying all arrays by the W coord
4721 // (which is basically 1/Z), which will be undone per-pixel
4722 // (multiplying by Z again) to get the perspective-correct array
4725 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4726 __m128 mipedgescale, mipdensity;
4727 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4728 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4729 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4730 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4731 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4732 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4733 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4734 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4735 attribedge1 = _mm_sub_ss(w0, w1);
4736 attribedge2 = _mm_sub_ss(w2, w1);
4737 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4738 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4739 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4740 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4741 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4742 _mm_store_ss(&triangle->w[0], attribxslope);
4743 _mm_store_ss(&triangle->w[1], attribyslope);
4744 _mm_store_ss(&triangle->w[2], attriborigin);
4745 mipedgescale = _mm_setzero_ps();
4746 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4748 __m128 attrib0, attrib1, attrib2;
4749 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4750 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4752 arrays += numvertices*4;
4753 GENATTRIBS(attrib0, attrib1, attrib2);
4754 attriborigin = _mm_mul_ps(attrib1, w1);
4755 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4756 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4757 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4758 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4759 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4760 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4761 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4762 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4763 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4765 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4766 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4767 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4768 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4772 memset(triangle->mip, 0, sizeof(triangle->mip));
4773 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4775 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4776 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4778 texture = thread->texbound[texunit];
4779 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4781 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4782 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4783 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4784 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4785 // this will be multiplied in the texturing routine by the texture resolution
4786 y = _mm_cvtss_si32(mipdensity);
4789 y = (int)(log((float)y)*0.5f/M_LN2);
4790 if (y > texture->mipmaps - 1)
4791 y = texture->mipmaps - 1;
4792 triangle->mip[texunit] = y;
4798 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4801 __m128 xcoords, xslope;
4802 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4803 int yccmask = _mm_movemask_epi8(ycc);
4804 int edge0p, edge0n, edge1p, edge1n;
4811 case 0xFFFF: /*0000*/ y = endy; continue;
4812 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4813 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4814 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4815 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4816 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4817 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4818 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4819 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4820 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4821 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4822 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4823 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4824 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4825 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4826 case 0x0000: /*1111*/ y++; continue;
4834 case 0xFFFF: /*000*/ y = endy; continue;
4835 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4836 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4837 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4838 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4839 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4840 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4841 case 0x0000: /*111*/ y++; continue;
4844 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4845 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4846 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4847 nexty = _mm_extract_epi16(ycc, 0);
4848 if (nexty >= bandy) nexty = bandy-1;
4849 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4850 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4851 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4852 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4853 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4854 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4856 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4857 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4859 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4861 int startx, endx, offset;
4862 startx = _mm_cvtss_si32(xcoords);
4863 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4866 if (startx < 0) startx = 0;
4867 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4869 if (endx > maxx) endx = maxx;
4870 if (startx >= endx) continue;
4871 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4873 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4874 span->triangle = thread->numtriangles;
4877 span->startx = max(minx - offset, 0);
4878 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4879 if (span->startx >= span->endx)
4881 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4882 DPSOFTRAST_Draw_ProcessSpans(thread);
4887 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4889 DPSOFTRAST_Draw_ProcessSpans(thread);
4890 thread->numtriangles = 0;
4894 if (!ATOMIC_DECREMENT(command->refcount))
4896 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4897 MM_FREE(command->arrays);
4900 if (thread->numspans > 0 || thread->numtriangles > 0)
4902 DPSOFTRAST_Draw_ProcessSpans(thread);
4903 thread->numtriangles = 0;
// Builds a DPSOFTRAST_OPCODE_Draw command together with the per-vertex and
// element storage that belongs to it.
//   firstvertex, numvertices, numtriangles - stored in the command and in the
//       global state; numvertices and numtriangles also size the storage
//   element3i / element3s - 32-bit / 16-bit triangle index lists; a copy of
//       the list is placed at the end of the storage
// Side effects: clears dpsoftrast.post_array4f and points
// dpsoftrast.screencoord4f and the post_array4f entries into the new storage.
// The caller (DPSOFTRAST_DrawTriangles) uses the result as the new command.
// NOTE(review): several short lines (braces, else, break, return) are not
// visible in this excerpt, so branch structure below is partly inferred.
4908 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4912 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4]-per-vertex arrays are always reserved; they become
// screencoord4f and the POSITION post array further down
4913 int datasize = 2*numvertices*sizeof(float[4]);
4914 DPSOFTRAST_Command_Draw *command;
4915 unsigned char *data;
// one additional float[4]-per-vertex array for each array the current
// shader mode lists (an index >= DPSOFTRAST_ARRAY_TOTAL is tested for;
// presumably it terminates the list - confirm)
4916 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4918 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4919 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4921 datasize += numvertices*sizeof(float[4]);
// room for a copy of the triangle indices, 16-bit or 32-bit (the
// conditions selecting between the two are not visible here)
4924 datasize += numtriangles*sizeof(unsigned short[3]);
4926 datasize += numtriangles*sizeof(int[3]);
4927 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// header plus data exceeds the maximum command size: only the header goes
// through DPSOFTRAST_AllocateCommand, the data is allocated with MM_CALLOC
4928 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4930 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4931 data = (unsigned char *)MM_CALLOC(datasize, 1);
// alternative path: a single allocation of header + data, with the data
// starting directly behind the aligned header
4935 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4936 data = (unsigned char *)command + commandsize;
4938 command->firstvertex = firstvertex;
4939 command->numvertices = numvertices;
4940 command->numtriangles = numtriangles;
4941 command->arrays = (float *)data;
// publish the destination arrays in the global state; data is advanced by
// one float[4]-per-vertex array after each assignment
4942 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4943 dpsoftrast.firstvertex = firstvertex;
4944 dpsoftrast.numvertices = numvertices;
4945 dpsoftrast.screencoord4f = (float *)data;
4946 data += numvertices*sizeof(float[4]);
4947 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4948 data += numvertices*sizeof(float[4]);
// same walk over the shader mode's array list as in the size calculation
4949 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4951 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4952 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4954 dpsoftrast.post_array4f[j] = (float *)data;
4955 data += numvertices*sizeof(float[4]);
// both element pointers start out NULL; the index list is then copied to
// the current end of the storage and the matching pointer is set
4957 command->element3i = NULL;
4958 command->element3s = NULL;
4961 command->element3s = (unsigned short *)data;
4962 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4966 command->element3i = (int *)data;
4967 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the current shader
// mode's Vertex() function, and hands the command to the worker threads (or
// flushes directly).
//   firstvertex, numvertices, numtriangles, element3i, element3s - forwarded
//       unchanged to DPSOFTRAST_Draw_AllocateDrawCommand
4972 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4974 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4975 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the drawn row range to the framebuffer height
4976 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4977 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: take the command back out of the queue. A commandsize
// no larger than the aligned header means the arrays were allocated
// separately, so they are released with MM_FREE first.
// NOTE(review): an early return presumably follows the undo - not visible here.
4978 if (command->starty >= command->endy)
4980 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4981 MM_FREE(command->arrays);
4982 DPSOFTRAST_UndoCommand(command->commandsize);
4985 command->clipped = dpsoftrast.drawclipped;
// one reference per thread
4986 command->refcount = dpsoftrast.numthreads;
4988 if (dpsoftrast.usethreads)
4991 DPSOFTRAST_Draw_SyncCommands();
// signal only threads that are marked starving and whose row band
// (miny1..maxy1 or miny2..maxy2) overlaps the command's row range
4992 for (i = 0; i < dpsoftrast.numthreads; i++)
4994 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4995 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4996 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the non-threaded branch - the else is not visible here
5001 DPSOFTRAST_Draw_FlushThreads();
// Executes queued commands for one thread, starting at thread->commandoffset
// and continuing until the offset equals endoffset.
//   thread    - thread state passed to every command handler
//   endoffset - offset into dpsoftrast.commandpool.commands at which to stop
// The reached offset is written back to thread->commandoffset.
5005 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5007 int commandoffset = thread->commandoffset;
5008 while (commandoffset != endoffset)
5010 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5011 switch (command->opcode)
// Fixed-size commands: call DPSOFTRAST_Interpret_<name>, advance by the
// aligned size of the command struct, and wrap to offset 0 once the end of
// the command pool is reached.
5013 #define INTERPCOMMAND(name) \
5014 case DPSOFTRAST_OPCODE_##name : \
5015 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5016 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5017 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5018 commandoffset = 0; \
5020 INTERPCOMMAND(Viewport)
5021 INTERPCOMMAND(ClearColor)
5022 INTERPCOMMAND(ClearDepth)
5023 INTERPCOMMAND(ColorMask)
5024 INTERPCOMMAND(DepthTest)
5025 INTERPCOMMAND(ScissorTest)
5026 INTERPCOMMAND(Scissor)
5027 INTERPCOMMAND(BlendFunc)
5028 INTERPCOMMAND(BlendSubtract)
5029 INTERPCOMMAND(DepthMask)
5030 INTERPCOMMAND(DepthFunc)
5031 INTERPCOMMAND(DepthRange)
5032 INTERPCOMMAND(PolygonOffset)
5033 INTERPCOMMAND(CullFace)
5034 INTERPCOMMAND(AlphaTest)
5035 INTERPCOMMAND(AlphaFunc)
5036 INTERPCOMMAND(SetTexture)
5037 INTERPCOMMAND(SetShader)
5038 INTERPCOMMAND(Uniform4f)
5039 INTERPCOMMAND(UniformMatrix4f)
5040 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so the stride is read from the command
// itself; progress is stored in the thread state right after the draw.
5042 case DPSOFTRAST_OPCODE_Draw:
5043 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5044 commandoffset += command->commandsize;
5045 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5047 thread->commandoffset = commandoffset;
// NOTE(review): the body of the Reset case is not visible in this excerpt
5050 case DPSOFTRAST_OPCODE_Reset:
5055 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to Thread_CreateThread by DPSOFTRAST_Init).
//   data - the DPSOFTRAST_State_Thread this worker serves
// Loops for as long as thread->index is non-negative.
// NOTE(review): the return statement is not visible in this excerpt.
5058 static int DPSOFTRAST_Draw_Thread(void *data)
5060 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5061 while(thread->index >= 0)
// behind the producer: execute everything queued up to dpsoftrast.drawcommand
5063 if (thread->commandoffset != dpsoftrast.drawcommand)
5065 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// NOTE(review): presumably the else branch (caught up) - the else is not
// visible here. The condition is re-checked under the mutex before sleeping.
5069 Thread_LockMutex(thread->drawmutex);
5070 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// wake a flusher blocked on waitcond, then sleep on drawcond with the
// starving flag set so producers know a signal is needed
5072 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5073 thread->starving = true;
5074 Thread_CondWait(thread->drawcond, thread->drawmutex);
5075 thread->starving = false;
5077 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread's commandoffset up to dpsoftrast.drawcommand, then
// resets the command pool usage counter.
5083 static void DPSOFTRAST_Draw_FlushThreads(void)
5085 DPSOFTRAST_State_Thread *thread;
5087 DPSOFTRAST_Draw_SyncCommands();
5088 if (dpsoftrast.usethreads)
// pass 1: signal every thread that is behind and marked starving
5090 for (i = 0; i < dpsoftrast.numthreads; i++)
5092 thread = &dpsoftrast.threads[i];
5093 if (thread->commandoffset != dpsoftrast.drawcommand)
5095 Thread_LockMutex(thread->drawmutex);
5096 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5097 Thread_CondSignal(thread->drawcond);
5098 Thread_UnlockMutex(thread->drawmutex);
// pass 2: for each thread that is still behind, block on its waitcond; the
// worker signals waitcond when it finds itself caught up and waiting is set
// NOTE(review): the wait is guarded by `if`, not a loop, so a spurious
// wakeup would let this return before the thread has caught up - confirm
// whether Thread_CondWait can wake spuriously.
5101 for (i = 0; i < dpsoftrast.numthreads; i++)
5103 thread = &dpsoftrast.threads[i];
5104 if (thread->commandoffset != dpsoftrast.drawcommand)
5106 Thread_LockMutex(thread->drawmutex);
5107 if (thread->commandoffset != dpsoftrast.drawcommand)
5109 thread->waiting = true;
5110 Thread_CondWait(thread->waitcond, thread->drawmutex);
5111 thread->waiting = false;
5113 Thread_UnlockMutex(thread->drawmutex);
// NOTE(review): presumably the non-threaded branch (else not visible here):
// the calling thread interprets the pending commands for each thread state
5119 for (i = 0; i < dpsoftrast.numthreads; i++)
5121 thread = &dpsoftrast.threads[i];
5122 if (thread->commandoffset != dpsoftrast.drawcommand)
5123 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5126 dpsoftrast.commandpool.usedcommands = 0;
// Public wrapper around DPSOFTRAST_Draw_FlushThreads().
5129 void DPSOFTRAST_Flush(void)
5131 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments.
// NOTE(review): the body is not visible in this excerpt; presumably it
// flushes pending work in the same way as DPSOFTRAST_Flush - confirm.
5134 void DPSOFTRAST_Finish(void)
// Resets the global rasterizer state and sets up the per-thread state.
//   width, height - framebuffer size; also used as the initial viewport and
//                   scissor rectangle
//   numthreads    - threading is enabled only when this is > 0 and
//                   Thread_HasThreads() reports support; clamped to 1..64
//                   when enabled, forced to 1 otherwise
//   interlace     - clamped to 0..1, forced to 0 without threads
//   colorpixels   - stored as fb_colorpixels[0]
//   depthpixels   - stored as fb_depthpixels
// NOTE(review): the return statement is not visible in this excerpt.
5139 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5149 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): `u` is declared in lines not shown here; presumably an
// int/byte union used as an endianness probe - confirm
5150 dpsoftrast.bigendian = u.b[3];
5151 dpsoftrast.fb_width = width;
5152 dpsoftrast.fb_height = height;
5153 dpsoftrast.fb_depthpixels = depthpixels;
5154 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): the next three lines all assign index 1; indices 1, 2 and 3
// look intended. The memset above has already zeroed the whole struct.
5155 dpsoftrast.fb_colorpixels[1] = NULL;
5156 dpsoftrast.fb_colorpixels[1] = NULL;
5157 dpsoftrast.fb_colorpixels[1] = NULL;
// initial viewport covers the whole framebuffer
5158 dpsoftrast.viewport[0] = 0;
5159 dpsoftrast.viewport[1] = 0;
5160 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5161 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5162 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with a capacity (texture_max) of 0
5163 dpsoftrast.texture_firstfree = 1;
5164 dpsoftrast.texture_end = 1;
5165 dpsoftrast.texture_max = 0;
5166 dpsoftrast.color[0] = 1;
5167 dpsoftrast.color[1] = 1;
5168 dpsoftrast.color[2] = 1;
5169 dpsoftrast.color[3] = 1;
5170 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5171 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5172 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): the allocation result is used below without a NULL check
5173 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5174 for (i = 0; i < dpsoftrast.numthreads; i++)
5176 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread
// NOTE(review): no assignment to colormask[0] is visible here - confirm it
// is set elsewhere or intentionally left at the calloc'd zero
5178 thread->cullface = GL_BACK;
5179 thread->colormask[1] = 1;
5180 thread->colormask[2] = 1;
5181 thread->colormask[3] = 1;
5182 thread->blendfunc[0] = GL_ONE;
5183 thread->blendfunc[1] = GL_ZERO;
5184 thread->depthmask = true;
5185 thread->depthtest = true;
5186 thread->depthfunc = GL_LEQUAL;
5187 thread->scissortest = false;
5188 thread->alphatest = false;
5189 thread->alphafunc = GL_GREATER;
5190 thread->alphavalue = 0.5f;
5191 thread->viewport[0] = 0;
5192 thread->viewport[1] = 0;
5193 thread->viewport[2] = dpsoftrast.fb_width;
5194 thread->viewport[3] = dpsoftrast.fb_height;
5195 thread->scissor[0] = 0;
5196 thread->scissor[1] = 0;
5197 thread->scissor[2] = dpsoftrast.fb_width;
5198 thread->scissor[3] = dpsoftrast.fb_height;
5199 thread->depthrange[0] = 0;
5200 thread->depthrange[1] = 1;
5201 thread->polygonoffset[0] = 0;
5202 thread->polygonoffset[1] = 0;
// interlaced: the framebuffer rows are divided into 2*numthreads slices and
// thread i owns slice i (miny1..maxy1) and slice numthreads+i (miny2..maxy2)
5204 if (dpsoftrast.interlace)
5206 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5207 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5208 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5209 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// not interlaced (else not visible here): one slice out of numthreads, with
// both band ranges set to the same values
5213 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5214 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5217 thread->numspans = 0;
5218 thread->numtriangles = 0;
5219 thread->commandoffset = 0;
5220 thread->waiting = false;
5221 thread->starving = false;
5223 thread->validate = -1;
5224 DPSOFTRAST_Validate(thread, -1);
// synchronisation objects and the worker itself exist only in threaded mode
5226 if (dpsoftrast.usethreads)
5228 thread->waitcond = Thread_CreateCond();
5229 thread->drawcond = Thread_CreateCond();
5230 thread->drawmutex = Thread_CreateMutex();
5231 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Stops the worker threads, releases texture and thread storage, and zeroes
// the global state.
5237 void DPSOFTRAST_Shutdown(void)
5240 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5242 DPSOFTRAST_State_Thread *thread;
5243 for (i = 0; i < dpsoftrast.numthreads; i++)
5245 thread = &dpsoftrast.threads[i];
// NOTE(review): one line between the lock and the signal is not visible in
// this excerpt; the worker loop runs while thread->index >= 0, so presumably
// the index is made negative here - confirm
5246 Thread_LockMutex(thread->drawmutex);
5248 Thread_CondSignal(thread->drawcond);
5249 Thread_UnlockMutex(thread->drawmutex);
// wait for the worker before destroying the objects it uses
5250 Thread_WaitThread(thread->thread, 0);
5251 Thread_DestroyCond(thread->waitcond);
5252 Thread_DestroyCond(thread->drawcond);
5253 Thread_DestroyMutex(thread->drawmutex);
// NOTE(review): dpsoftrast.texture[i] is read here before the NULL check on
// dpsoftrast.texture below, while DPSOFTRAST_Init sets texture_end to 1; if
// texture can still be NULL at that point this dereferences NULL - confirm
5256 for (i = 0;i < dpsoftrast.texture_end;i++)
5257 if (dpsoftrast.texture[i].bytes)
5258 MM_FREE(dpsoftrast.texture[i].bytes);
5259 if (dpsoftrast.texture)
5260 free(dpsoftrast.texture);
5261 if (dpsoftrast.threads)
5262 MM_FREE(dpsoftrast.threads);
5263 memset(&dpsoftrast, 0, sizeof(dpsoftrast));