// Platform abstraction: alignment attributes, a memory barrier and atomic
// counter operations, selected per compiler/OS by the conditionals below.
// NOTE(review): several #else/#endif/#ifndef lines of this section are not
// visible in this excerpt, so the exact nesting cannot be confirmed here.
3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
// byte alignment handed to _mm_malloc by the MM_MALLOC/MM_CALLOC helpers
14 #define ATOMIC_SIZE 32
// Apple: counters use the OSAtomic*32Barrier functions
17 #if defined(__APPLE__)
18 #include <libkern/OSAtomic.h>
19 #define ALIGN(var) var __attribute__((__aligned__(16)))
20 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21 #define MEMORY_BARRIER (_mm_sfence())
22 #define ATOMIC_COUNTER volatile int32_t
23 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// other GNU-compatible compilers: counters use the __sync builtins
26 #elif defined(__GNUC__)
27 #define ALIGN(var) var __attribute__((__aligned__(16)))
28 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: counters use the Interlocked* functions
35 #elif defined(_MSC_VER)
36 #define ALIGN(var) __declspec(align(16)) var
37 #define ATOMIC(var) __declspec(align(32)) var
38 #define MEMORY_BARRIER (_mm_sfence())
40 #define ATOMIC_COUNTER volatile LONG
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment, no barrier, and plain (non-atomic)
// increment/decrement/add on an ordinary int.
48 #define ALIGN(var) var
51 #define ATOMIC(var) var
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// Allocation helpers. The first set obtains memory from _mm_malloc aligned to
// ATOMIC_SIZE; the second set maps the same names onto malloc/calloc.
// NOTE(review): the conditionals that choose between the two sets are not
// visible in this excerpt.
70 #include <emmintrin.h>
72 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-style wrapper: requests nmemb*size aligned bytes and zero-fills them
// when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow.
74 static void *MM_CALLOC(size_t nmemb, size_t size)
76 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
77 if (ptr != NULL) memset(ptr, 0, nmemb*size);
81 #define MM_FREE _mm_free
83 #define MM_MALLOC(size) malloc(size)
84 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and
// is used to size attribute tables elsewhere in this file.
88 typedef enum DPSOFTRAST_ARRAY_e
90 DPSOFTRAST_ARRAY_POSITION,
91 DPSOFTRAST_ARRAY_COLOR,
92 DPSOFTRAST_ARRAY_TEXCOORD0,
93 DPSOFTRAST_ARRAY_TEXCOORD1,
94 DPSOFTRAST_ARRAY_TEXCOORD2,
95 DPSOFTRAST_ARRAY_TEXCOORD3,
96 DPSOFTRAST_ARRAY_TEXCOORD4,
97 DPSOFTRAST_ARRAY_TEXCOORD5,
98 DPSOFTRAST_ARRAY_TEXCOORD6,
99 DPSOFTRAST_ARRAY_TEXCOORD7,
100 DPSOFTRAST_ARRAY_TOTAL
// One texture object.
// NOTE(review): further fields (flags, width, height, depth, sides, mipmaps,
// size are written by DPSOFTRAST_Texture_New) are not visible in this excerpt.
104 typedef struct DPSOFTRAST_Texture_s
111 DPSOFTRAST_TEXTURE_FILTER filter;
// counter consulted before the pixel data is modified
// NOTE(review): presumably the number of pending draws using the texture - confirm
114 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks an unused slot
115 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width,
// [3] height, [4] depth (as filled in by DPSOFTRAST_Texture_New)
116 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command stream records. Every command starts with the same header
// (opcode, commandsize) so a reader can dispatch on opcode and step by
// commandsize.
// NOTE(review): ALIGN_SIZE is defined outside the lines visible here.
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
// generic header view of any command
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
125 unsigned char opcode;
126 unsigned short commandsize;
// opcode 0 is the Reset marker written by DPSOFTRAST_AllocateCommand when a
// command does not fit before the end of the pool
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> and the
// aligned struct DPSOFTRAST_Command_<name> consisting of the common header
// followed by the given fields.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
136 unsigned char opcode; \
137 unsigned short commandsize; \
139 } DPSOFTRAST_Command_##name );
// Byte capacity of the command pool and the size limit named for a single
// command.
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands.
// NOTE(review): the freecommand/usedcommands members used by the allocator
// are not visible in this excerpt.
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
148 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
150 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the span shaders.
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
154 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array three float4 rows: the CALCATTRIB macros use row 0 as
// the x slope, row 1 as the y slope and row 2 as the base value
156 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
158 DPSOFTRAST_State_Triangle);
// SSE evaluation of one interpolated attribute at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data are assigned, so they must be __m128 lvalues; the attribs
// rows are read with aligned loads.
160 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
161 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
162 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
163 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
164 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: evaluates one interpolated
// attribute at the start of a span, writing four floats each to slope and data:
//   slope[c] = attribs[arrayindex][0][c]
//   data[c]  = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c]
// triangle and span are pointers, data and slope are float[4] lvalues.
// Every macro argument is parenthesized so that expression arguments such as
// "spans + i" expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced by triangle setup and queued for
// shading.
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
181 int triangle; // triangle this span was generated by
182 int x; // framebuffer x coord
183 int y; // framebuffer y coord
184 int startx; // usable range (according to pixelmask)
185 int endx; // usable range (according to pixelmask)
186 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
188 DPSOFTRAST_State_Span);
// capacities of the per-thread span and triangle queues
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in thread->validate; each bit names a group of derived
// values that DPSOFTRAST_Validate recomputes on demand.
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// every group together
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc selects
// one of them from the current blend factors (and the blendsubtract flag).
198 typedef enum DPSOFTRAST_BLENDMODE_e
200 DPSOFTRAST_BLENDMODE_OPAQUE,
201 DPSOFTRAST_BLENDMODE_ALPHA,
202 DPSOFTRAST_BLENDMODE_ADDALPHA,
203 DPSOFTRAST_BLENDMODE_ADD,
204 DPSOFTRAST_BLENDMODE_INVMOD,
205 DPSOFTRAST_BLENDMODE_MUL,
206 DPSOFTRAST_BLENDMODE_MUL2,
207 DPSOFTRAST_BLENDMODE_SUBALPHA,
208 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209 DPSOFTRAST_BLENDMODE_INVADD,
210 DPSOFTRAST_BLENDMODE_TOTAL
212 DPSOFTRAST_BLENDMODE;
// Per-thread copy of the render state plus the thread's work queues. The
// DPSOFTRAST_Interpret_* handlers write this state as commands are consumed.
// NOTE(review): many members used elsewhere in this file (viewport, scissor,
// validate, drawmutex, ...) are not visible in this excerpt.
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
233 float polygonoffset[2];
236 int shader_permutation;
237 int shader_exactspecularmath;
239 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
// four floats per uniform
241 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
242 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
244 // DPSOFTRAST_VALIDATE_ flags
247 // derived values (DPSOFTRAST_VALIDATE_FB)
250 ALIGN(float fb_viewportcenter[4]);
251 ALIGN(float fb_viewportscale[4]);
253 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
256 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's read position in the command pool
// (compared against the pool's write position in DPSOFTRAST_Draw_FreeCommandPool)
265 ATOMIC(volatile int commandoffset);
// handshake flags used together with drawmutex/drawcond/waitcond
267 volatile bool waiting;
268 volatile bool starving;
275 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
276 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
278 DPSOFTRAST_State_Thread);
// Global rasterizer state owned by the API-calling side: render targets,
// vertex array pointers, texture table, worker threads and the command pool.
// NOTE(review): several members referenced elsewhere (fb_width, fb_height,
// viewport, texture_max, texture_end, numthreads, usethreads, ...) are not
// visible in this excerpt.
280 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets as supplied to DPSOFTRAST_SetRenderTargets
284 unsigned int *fb_depthpixels;
285 unsigned int *fb_colorpixels[4];
288 ALIGN(float fb_viewportcenter[4]);
289 ALIGN(float fb_viewportscale[4]);
292 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
293 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
295 const float *pointer_vertex3f;
296 const float *pointer_color4f;
297 const unsigned char *pointer_color4ub;
298 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
301 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
303 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
307 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
308 float *screencoord4f;
314 int shader_permutation;
315 int shader_exactspecularmath;
// lowest texture slot index worth checking when searching for a free slot
319 int texture_firstfree;
// texture table; slot is in use when its bytes pointer is non-NULL
320 DPSOFTRAST_Texture *texture;
// description of the most recent API error (set by the validation paths)
325 const char *errorstring;
330 DPSOFTRAST_State_Thread *threads;
// command pool position published to the worker threads by
// DPSOFTRAST_Draw_SyncCommands
332 ATOMIC(volatile int drawcommand);
334 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
338 DPSOFTRAST_State dpsoftrast;
// scale applied when converting a floating point depth to the integer depth buffer format
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one BGRA8 pixel with
// round-to-nearest: blue in bits 0-7, green 8-15, red 16-23, alpha 24-31.
// Arguments are parenthesized so expression arguments expand correctly, and
// the alpha byte is shifted as unsigned because shifting a signed int into
// the sign bit is undefined behavior. The result type remains int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth format; d == 1 maps to 0 and
// smaller d maps to larger values.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
346 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
348 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
349 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
350 fb_viewportcenter[3] = 0.5f;
351 fb_viewportcenter[0] = 0.0f;
352 fb_viewportscale[1] = 0.5f * viewport[2];
353 fb_viewportscale[2] = -0.5f * viewport[3];
354 fb_viewportscale[3] = 0.5f;
355 fb_viewportscale[0] = 1.0f;
// Recomputes the thread's derived framebuffer values: fb_scissor as
// {x, y, width, height} in framebuffer rows (y flipped against fb_height),
// then the viewport center/scale via DPSOFTRAST_RecalcViewport.
// NOTE(review): the local declarations and any lower-bound clamps of x1/y1
// are not visible in this excerpt; only the upper-bound clamps are shown.
358 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
360 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
361 // and viewport projection values
364 x1 = thread->scissor[0];
365 x2 = thread->scissor[0] + thread->scissor[2];
366 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
367 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with scissor testing off the whole framebuffer is used
368 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
370 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
372 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
373 thread->fb_scissor[0] = x1;
374 thread->fb_scissor[1] = y1;
375 thread->fb_scissor[2] = x2 - x1;
376 thread->fb_scissor[3] = y2 - y1;
378 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
381 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
383 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
386 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
388 if (thread->blendsubtract)
390 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
392 #define BLENDFUNC(sfactor, dfactor, blendmode) \
393 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
394 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
395 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
400 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
402 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
403 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
404 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
405 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
406 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
407 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
408 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
409 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
410 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
411 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
412 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Cheap inline guard around DPSOFTRAST_Validate: calls it only when at least
// one of the requested dirty bits f is set in thread->validate. Always
// evaluates to 0. The thread argument is parenthesized so pointer expressions
// (e.g. "threads + i") expand correctly.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state groups that are both requested in mask and
// currently marked dirty in thread->validate, clearing each dirty bit before
// its recalculation runs.
// NOTE(review): lines between the mask computation and the first test are not
// visible in this excerpt.
419 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested groups that are actually dirty
421 mask &= thread->validate;
424 if (mask & DPSOFTRAST_VALIDATE_FB)
426 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
427 DPSOFTRAST_RecalcFB(thread);
429 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
431 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
432 DPSOFTRAST_RecalcDepthFunc(thread);
434 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
436 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
437 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by handle. A handle is accepted when it is at least 1,
// below dpsoftrast.texture_end, and its slot has pixel storage allocated; the
// matching table entry is then returned.
// NOTE(review): the return for rejected handles is not visible in this
// excerpt; callers test the result against NULL.
441 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
443 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
444 return &dpsoftrast.texture[index];
// Enlarges the texture table (to at least 1024 entries, otherwise doubling)
// and rebases every bound-texture pointer, in the global state and in each
// thread, from the old table address to the new one.
// NOTE(review): some lines of this function are not visible in this excerpt
// (declarations, the else between the two size assignments, any
// synchronization performed before the table moves).
448 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be rebased afterwards
450 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
451 DPSOFTRAST_State_Thread *thread;
455 // expand texture array as needed
456 if (dpsoftrast.texture_max < 1024)
457 dpsoftrast.texture_max = 1024;
459 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result replaces the table pointer without a NULL
// check, so an allocation failure would lose the old table; the added
// entries are also not zeroed here.
460 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the API-side bindings
461 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
462 if (dpsoftrast.texbound[i])
463 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase each thread's bindings
464 for (j = 0; j < dpsoftrast.numthreads; j++)
466 thread = &dpsoftrast.threads[j];
467 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
468 if (thread->texbound[i])
469 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates flags and dimensions, claims the first free
// slot of the texture table (growing it when needed), lays out the mip chain
// and allocates zeroed pixel storage for all levels.
// On a validation failure dpsoftrast.errorstring is set to a description.
// NOTE(review): many lines (local declarations, the switch header, the return
// statements, the mip loop header and its size accumulation) are not visible
// in this excerpt, so return values cannot be documented from here.
473 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
482 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
483 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
484 DPSOFTRAST_Texture *texture;
// NOTE(review): the product can overflow int, and two negative dimensions
// multiply to a positive value
485 if (width*height*depth < 1)
487 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
490 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
492 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
497 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
498 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
499 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
501 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
502 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
504 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): same message as the line above; the condition guarding this
// second assignment is not visible here - confirm the message fits its case
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
512 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
514 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
519 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
521 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
524 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this test repeats the previous one verbatim
529 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
534 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
536 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero for any positive value that is not a power of two
539 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
544 // find first empty slot in texture array
545 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
546 if (!dpsoftrast.texture[texnum].bytes)
548 dpsoftrast.texture_firstfree = texnum + 1;
549 if (dpsoftrast.texture_max <= texnum)
550 DPSOFTRAST_Texture_Grow();
551 if (dpsoftrast.texture_end <= texnum)
552 dpsoftrast.texture_end = texnum + 1;
553 texture = &dpsoftrast.texture[texnum];
554 memset(texture, 0, sizeof(*texture));
555 texture->flags = flags;
556 texture->width = width;
557 texture->height = height;
558 texture->depth = depth;
559 texture->sides = sides;
// mip chain layout: 4 bytes per texel, all sides stored per level
571 s = w * h * d * sides * 4;
572 texture->mipmap[mipmaps][0] = size;
573 texture->mipmap[mipmaps][1] = s;
574 texture->mipmap[mipmaps][2] = w;
575 texture->mipmap[mipmaps][3] = h;
576 texture->mipmap[mipmaps][4] = d;
// stop after the 1x1x1 level, or after the first level without mipmapping
579 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 texture->mipmaps = mipmaps;
586 texture->size = size;
588 // allocate the pixels now
// NOTE(review): the allocation result is not checked in the visible lines
589 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture: frees its pixel storage, clears the table slot and
// updates the free/used bookkeeping. Does nothing for handles that do not
// name a live texture.
// NOTE(review): lines between the lookup and the MM_FREE call are not visible
// in this excerpt.
593 void DPSOFTRAST_Texture_Free(int index)
595 DPSOFTRAST_Texture *texture;
596 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
600 MM_FREE(texture->bytes);
601 texture->bytes = NULL;
// zeroing the slot marks it unused (bytes == NULL)
602 memset(texture, 0, sizeof(*texture));
603 // adjust the free range and used range
604 if (dpsoftrast.texture_firstfree > index)
605 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing unused slots
606 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
607 dpsoftrast.texture_end--;
// Regenerates every mip level above 0 by averaging texels of the level below
// it, with rounding. Four bytes per texel are processed. Does nothing for
// invalid handles.
// NOTE(review): several lines (the computation of layer0/layer1/row0/row1,
// the 2D-versus-3D test, braces and else branches) are not visible in this
// excerpt; the notes below describe only what is shown.
609 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
611 int i, x, y, z, w, layer0, layer1, row0, row1;
612 unsigned char *o, *i0, *i1, *i2, *i3;
613 DPSOFTRAST_Texture *texture;
614 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
615 if (texture->mipmaps <= 1)
// level i is built from level i-1
617 for (i = 1;i < texture->mipmaps;i++)
619 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
623 if (layer1 >= texture->mipmap[i-1][4])
624 layer1 = texture->mipmap[i-1][4]-1;
625 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
629 if (row1 >= texture->mipmap[i-1][3])
630 row1 = texture->mipmap[i-1][3]-1;
// o: output row; i0..i3: the source rows (layer0/layer1 x row0/row1)
631 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
632 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
633 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
634 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
635 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
636 w = texture->mipmap[i][2];
639 if (texture->mipmap[i-1][2] > 1)
641 // average 3D texture
// eight source texels per output texel
642 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
644 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
645 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
646 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
647 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
652 // average 3D mipmap with parent width == 1
// NOTE(review): i2/i3 are not advanced by this loop; presumably w is 1 here
// so only one iteration runs - confirm
653 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
655 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
656 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
657 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
658 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
664 if (texture->mipmap[i-1][2] > 1)
666 // average 2D texture (common case)
// four source texels per output texel
667 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
669 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
670 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
671 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
672 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
677 // 2D texture with parent width == 1
678 o[0] = (i0[0] + i1[0] + 1) >> 1;
679 o[1] = (i0[1] + i1[1] + 1) >> 1;
680 o[2] = (i0[2] + i1[2] + 1) >> 1;
681 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from pixels
// (tightly packed rows) into the texture at (blockx, blocky), then rebuilds
// the mip chain. Does nothing for invalid handles.
// NOTE(review): in the visible lines the destination is always addressed with
// level 0's width and offset, and the mip parameter is not used; the block
// rectangle is not bounds-checked against the texture. Some lines (local
// declarations, the loop counter update) are not visible in this excerpt.
688 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
690 DPSOFTRAST_Texture *texture;
692 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
695 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
696 while (blockheight > 0)
698 memcpy(dst, pixels, blockwidth * 4);
699 pixels += blockwidth * 4;
// advance one full texture row
700 dst += texture->mipmap[0][2] * 4;
703 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with the supplied pixels (the byte size
// recorded for level 0 is copied) and rebuilds the mip chain. Does nothing
// for invalid handles.
// NOTE(review): lines between the lookup and the copy are not visible in this
// excerpt.
705 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
707 DPSOFTRAST_Texture *texture;
708 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
711 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
712 DPSOFTRAST_Texture_CalculateMipmaps(index);
714 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
716 DPSOFTRAST_Texture *texture;
717 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
718 return texture->mipmap[mip][2];
720 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
722 DPSOFTRAST_Texture *texture;
723 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
724 return texture->mipmap[mip][3];
726 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
728 DPSOFTRAST_Texture *texture;
729 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
730 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level mip inside the texture's
// pixel storage, or 0 (null) for handles that do not name a live texture.
// NOTE(review): mip is used as an array index without a range check, and
// lines between the lookup and the return are not visible in this excerpt.
732 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
734 DPSOFTRAST_Texture *texture;
735 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
738 return texture->bytes + texture->mipmap[mip][0];
// Sets the sampling filter of a texture. Filters ordered above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected with an error string when the
// texture was created without the mipmap flag. Does nothing for invalid
// handles.
// NOTE(review): the lines following the error assignment (presumably an early
// return) are not visible in this excerpt.
740 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
742 DPSOFTRAST_Texture *texture;
743 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
744 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
746 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
751 texture->filter = filter;
// Installs the framebuffer dimensions, the depth buffer and up to four color
// buffers as the current render targets. The buffers are caller-provided
// pointers that are stored as given.
// NOTE(review): the statement controlled by the change test below is not
// visible in this excerpt (presumably pending work is flushed when any target
// differs - confirm).
754 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
// true when anything differs from the currently installed targets
756 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
757 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
758 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
760 dpsoftrast.fb_width = width;
761 dpsoftrast.fb_height = height;
762 dpsoftrast.fb_depthpixels = depthpixels;
763 dpsoftrast.fb_colorpixels[0] = colorpixels0;
764 dpsoftrast.fb_colorpixels[1] = colorpixels1;
765 dpsoftrast.fb_colorpixels[2] = colorpixels2;
766 dpsoftrast.fb_colorpixels[3] = colorpixels3;
769 static void DPSOFTRAST_Draw_FlushThreads(void);
771 static void DPSOFTRAST_Draw_SyncCommands(void)
773 if(dpsoftrast.usethreads) MEMORY_BARRIER;
774 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes room for space bytes in the command pool. If the recorded usage
// already leaves enough room nothing more is needed; otherwise the queued
// commands are published, the real usage is recomputed from the read position
// of each thread, and the thread selected by waitindex is waited on.
// NOTE(review): several lines (declarations, returns, loop structure, the
// assignment of waitindex) are not visible in this excerpt, so the exact
// control flow cannot be confirmed here.
777 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
779 DPSOFTRAST_State_Thread *thread;
781 int freecommand = dpsoftrast.commandpool.freecommand;
782 int usedcommands = dpsoftrast.commandpool.usedcommands;
783 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
785 DPSOFTRAST_Draw_SyncCommands();
791 for (i = 0; i < dpsoftrast.numthreads; i++)
793 thread = &dpsoftrast.threads[i];
// distance from this thread's read position to the write position,
// wrapped into the pool size
794 commandoffset = freecommand - thread->commandoffset;
795 if (commandoffset < 0)
796 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
797 if (commandoffset > usedcommands)
800 usedcommands = commandoffset;
803 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
805 thread = &dpsoftrast.threads[waitindex];
806 Thread_LockMutex(thread->drawmutex);
// only wait if the thread has not yet consumed everything published
807 if (thread->commandoffset != dpsoftrast.drawcommand)
809 thread->waiting = true;
// wake the thread first if it is idle
810 if (thread->starving) Thread_CondSignal(thread->drawcond);
811 Thread_CondWait(thread->waitcond, thread->drawmutex);
812 thread->waiting = false;
814 Thread_UnlockMutex(thread->drawmutex);
816 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE.
// COMMAND_SIZE must be a power of two.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Typed front end to DPSOFTRAST_AllocateCommand: reserves a command of type
// DPSOFTRAST_Command_<name>, with its size rounded up by
// DPSOFTRAST_ALIGNCOMMAND and its opcode set to DPSOFTRAST_OPCODE_<name>, and
// yields a pointer of the matching struct type.
821 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
822 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves size bytes for a new command in the circular command pool and
// fills in its header (opcode, commandsize). When the command does not fit in
// the bytes remaining before the end of the pool, a Reset command is written
// there and the remainder is counted as used. When the pool is too full, room
// is made first (waiting on worker threads, or flushing without threads).
// NOTE(review): several lines (else keywords, the wrap of freecommand, the
// advance of freecommand and the return) are not visible in this excerpt.
824 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
826 DPSOFTRAST_Command *command;
827 int freecommand = dpsoftrast.commandpool.freecommand;
828 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one extra command header
829 int extra = sizeof(DPSOFTRAST_Command);
// the skipped tail of the pool also has to be free
830 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
832 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
834 if (dpsoftrast.usethreads)
835 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
837 DPSOFTRAST_Draw_FlushThreads();
// re-read the pool positions after making room
838 freecommand = dpsoftrast.commandpool.freecommand;
839 usedcommands = dpsoftrast.commandpool.usedcommands;
841 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
// mark the unused tail with a Reset command
843 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
844 command->opcode = DPSOFTRAST_OPCODE_Reset;
845 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
848 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
849 command->opcode = opcode;
// NOTE(review): commandsize is an unsigned short; sizes above 65535 would be truncated
850 command->commandsize = size;
852 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
854 dpsoftrast.commandpool.freecommand = freecommand;
855 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives back size bytes to the command pool: the usage count is reduced by
// size and the write position is stored back.
// NOTE(review): the line that moves freecommand back and the condition
// guarding the wrap-around correction are not visible in this excerpt.
859 static void DPSOFTRAST_UndoCommand(int size)
861 int freecommand = dpsoftrast.commandpool.freecommand;
862 int usedcommands = dpsoftrast.commandpool.usedcommands;
865 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
866 usedcommands -= size;
867 dpsoftrast.commandpool.freecommand = freecommand;
868 dpsoftrast.commandpool.usedcommands = usedcommands;
871 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
872 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
874 thread->viewport[0] = command->x;
875 thread->viewport[1] = command->y;
876 thread->viewport[2] = command->width;
877 thread->viewport[3] = command->height;
878 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API entry point: queues a Viewport command for the worker threads, and also
// updates the API-side copy of the viewport and its derived center/scale.
// NOTE(review): the stores of x and y into the command are not visible in
// this excerpt.
880 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
882 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
885 command->width = width;
886 command->height = height;
888 dpsoftrast.viewport[0] = x;
889 dpsoftrast.viewport[1] = y;
890 dpsoftrast.viewport[2] = width;
891 dpsoftrast.viewport[3] = height;
892 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
895 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Worker-side handler for ClearColor: fills the scissor rectangle of every
// attached color buffer with the packed BGRA8 clear color, restricted to the
// rows between this thread's miny1 and maxy2.
// NOTE(review): several lines (remaining declarations, early-outs, the inner
// loop body) are not visible in this excerpt.
896 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
898 int i, x1, y1, x2, y2, w, h, x, y;
// the thread's two row bands
899 int miny1 = thread->miny1;
900 int maxy1 = thread->maxy1;
901 int miny2 = thread->miny2;
902 int maxy2 = thread->maxy2;
// make sure fb_scissor is current
906 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
907 x1 = thread->fb_scissor[0];
908 y1 = thread->fb_scissor[1];
909 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
910 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rectangle to the rows owned by this thread
911 if (y1 < miny1) y1 = miny1;
912 if (y2 > maxy2) y2 = maxy2;
917 // FIXME: honor fb_colormask?
918 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// one pass per color buffer
919 for (i = 0;i < 4;i++)
921 if (!dpsoftrast.fb_colorpixels[i])
// first band up to maxy1, then continue from miny2 up to maxy2
923 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
926 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
927 for (x = x1;x < x2;x++)
// API entry point: queues a ClearColor command.
// NOTE(review): the stores of r, g, b, a into the command are not visible in
// this excerpt.
932 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
934 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
941 DEFCOMMAND(3, ClearDepth, float depth;)
// Worker-side handler for ClearDepth: fills the scissor rectangle of the
// depth buffer with the converted depth value, restricted to the rows between
// this thread's miny1 and maxy2.
// NOTE(review): several lines (remaining declarations, early-outs, the inner
// loop body) are not visible in this excerpt.
942 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
944 int x1, y1, x2, y2, w, h, x, y;
// the thread's two row bands
945 int miny1 = thread->miny1;
946 int maxy1 = thread->maxy1;
947 int miny2 = thread->miny2;
948 int maxy2 = thread->maxy2;
// make sure fb_scissor is current
952 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
953 x1 = thread->fb_scissor[0];
954 y1 = thread->fb_scissor[1];
955 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
956 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rectangle to the rows owned by this thread
957 if (y1 < miny1) y1 = miny1;
958 if (y2 > maxy2) y2 = maxy2;
963 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first band up to maxy1, then continue from miny2 up to maxy2
964 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
967 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
968 for (x = x1;x < x2;x++)
// API entry point: queues a ClearDepth command.
// NOTE(review): the store of d into the command is not visible in this
// excerpt.
972 void DPSOFTRAST_ClearDepth(float d)
974 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
978 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
979 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
981 thread->colormask[0] = command->r != 0;
982 thread->colormask[1] = command->g != 0;
983 thread->colormask[2] = command->b != 0;
984 thread->colormask[3] = command->a != 0;
985 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API entry point: queues a ColorMask command.
// NOTE(review): the stores of r, g, b, a into the command are not visible in
// this excerpt.
987 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
989 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
996 DEFCOMMAND(5, DepthTest, int enable;)
997 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
999 thread->depthtest = command->enable;
1000 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1002 void DPSOFTRAST_DepthTest(int enable)
1004 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1005 command->enable = enable;
1008 DEFCOMMAND(6, ScissorTest, int enable;)
1009 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1011 thread->scissortest = command->enable;
1012 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1014 void DPSOFTRAST_ScissorTest(int enable)
1016 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1017 command->enable = enable;
1020 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1021 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1023 thread->scissor[0] = command->x;
1024 thread->scissor[1] = command->y;
1025 thread->scissor[2] = command->width;
1026 thread->scissor[3] = command->height;
1027 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API entry point: queues a Scissor command.
// NOTE(review): the stores of x and y into the command are not visible in
// this excerpt.
1029 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1031 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1034 command->width = width;
1035 command->height = height;
1038 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1039 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1041 thread->blendfunc[0] = command->sfactor;
1042 thread->blendfunc[1] = command->dfactor;
1043 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1045 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1047 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1048 command->sfactor = sfactor;
1049 command->dfactor = dfactor;
1052 DEFCOMMAND(9, BlendSubtract, int enable;)
1053 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1055 thread->blendsubtract = command->enable;
1056 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1058 void DPSOFTRAST_BlendSubtract(int enable)
1060 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1061 command->enable = enable;
1064 DEFCOMMAND(10, DepthMask, int enable;)
1065 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1067 thread->depthmask = command->enable;
1069 void DPSOFTRAST_DepthMask(int enable)
1071 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1072 command->enable = enable;
1075 DEFCOMMAND(11, DepthFunc, int func;)
1076 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1078 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and records func in it.
1080 void DPSOFTRAST_DepthFunc(int func)
1082 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1083 command->func = func;
// Command 12 (DepthRange) carries the near and far depth values.
1086 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Thread-side handler: stores near/far in thread->depthrange[0..1].
1087 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1089 thread->depthrange[0] = command->nearval;
1090 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and records the near and
// far values in it.
1092 void DPSOFTRAST_DepthRange(float nearval, float farval)
1094 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1095 command->nearval = nearval;
1096 command->farval = farval;
// Command 13 (PolygonOffset) carries two offset values: alongnormal and intoview.
1099 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Thread-side handler: stores the two offsets in thread->polygonoffset[0..1].
1100 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1102 thread->polygonoffset[0] = command->alongnormal;
1103 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both
// offset values in it.
1105 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1107 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1108 command->alongnormal = alongnormal;
1109 command->intoview = intoview;
// Command 14 (CullFace) carries the culling mode selector.
1112 DEFCOMMAND(14, CullFace, int mode;)
// Thread-side handler: stores the mode in thread->cullface.
1113 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1115 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and records mode in it.
1117 void DPSOFTRAST_CullFace(int mode)
1119 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1120 command->mode = mode;
// Command 15 (AlphaTest) carries a single enable flag.
1123 DEFCOMMAND(15, AlphaTest, int enable;)
// Thread-side handler: stores the flag in thread->alphatest.
1124 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1126 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command and records the enable
// flag in it.
1128 void DPSOFTRAST_AlphaTest(int enable)
1130 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1131 command->enable = enable;
// Command 16 (AlphaFunc) carries a comparison selector and a reference value.
1134 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Thread-side handler: stores func in thread->alphafunc and ref in
// thread->alphavalue.
1135 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1137 thread->alphafunc = command->func;
1138 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and records func in it.
// NOTE(review): the store of `ref` into the command is not visible in this
// excerpt - confirm it is written (DPSOFTRAST_Interpret_AlphaFunc reads command->ref).
1140 void DPSOFTRAST_AlphaFunc(int func, float ref)
1142 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1143 command->func = func;
// Sets the constant RGBA color in dpsoftrast.color[0..3]. The value is written
// directly to the global state; no command is allocated here.
// DPSOFTRAST_Array_Load fills the color array with it when no color pointer is set.
1147 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1149 dpsoftrast.color[0] = r;
1150 dpsoftrast.color[1] = g;
1151 dpsoftrast.color[2] = b;
1152 dpsoftrast.color[3] = a;
// Reads a rectangular block of pixels out of color buffer 0 into outpixels.
//   blockx, blocky            - corner of the requested block
//   blockwidth, blockheight   - size of the requested block in pixels
//   outpixels                 - destination, addressed with a row stride of blockwidth*4 bytes
// The requested rectangle is clamped to the framebuffer. Source rows are
// addressed as (fb_height - 1 - y), i.e. the image is flipped vertically while
// copying. Two copy loops exist, selected by dpsoftrast.bigendian; the
// per-pixel bodies are not visible in this excerpt.
1155 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// both buffers use 4 bytes per pixel
1157 int outstride = blockwidth * 4;
1158 int instride = dpsoftrast.fb_width * 4;
1161 int bx2 = blockx + blockwidth;
1162 int by2 = blocky + blockheight;
1166 unsigned char *inpixels;
// clamp the block to the framebuffer bounds
1170 if (bx1 < 0) bx1 = 0;
1171 if (by1 < 0) by1 = 0;
1172 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1173 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1175 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1176 if (dpsoftrast.bigendian)
1178 for (y = by1;y < by2;y++)
// b: start of the source row (bottom-up addressing); o: start of the output
// row, counted from the clamped by1
1180 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1181 o = (unsigned char *)outpixels + (y - by1) * outstride;
1182 for (x = bx1;x < bx2;x++)
1195 for (y = by1;y < by2;y++)
1197 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1198 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle from color buffer 0 into one mip level of a texture.
//   index, mip     - texture handle and mip level to write
//   tx, ty         - destination corner inside the mip level
//   sx, sy         - source corner inside the framebuffer
//   width, height  - requested size in pixels
// Returns without copying when the texture handle is invalid or mip is out of
// range. Both rectangles are clamped to their surfaces and the copy size is
// limited to the smaller of the two.
1204 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1208 int tx2 = tx + width;
1209 int ty2 = ty + height;
1212 int sx2 = sx + width;
1213 int sy2 = sy + height;
1223 unsigned int *spixels;
1224 unsigned int *tpixels;
1225 DPSOFTRAST_Texture *texture;
1226 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1227 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the framebuffer; destination is the mip level's pixel data inside
// texture->bytes (mipmap[mip][0] is used as byte offset, [2]/[3] as width/height)
1229 spixels = dpsoftrast.fb_colorpixels[0];
1230 swidth = dpsoftrast.fb_width;
1231 sheight = dpsoftrast.fb_height;
1232 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1233 twidth = texture->mipmap[mip][2];
1234 theight = texture->mipmap[mip][3];
// clamp the target rectangle to the mip level and the source rectangle to the framebuffer
1235 if (tx1 < 0) tx1 = 0;
1236 if (ty1 < 0) ty1 = 0;
1237 if (tx2 > twidth) tx2 = twidth;
1238 if (ty2 > theight) ty2 = theight;
1239 if (sx1 < 0) sx1 = 0;
1240 if (sy1 < 0) sy1 = 0;
1241 if (sx2 > swidth) sx2 = swidth;
1242 if (sy2 > sheight) sy2 = sheight;
// never copy more than the source rectangle provides
1247 if (tw > sw) tw = sw;
1248 if (th > sh) th = sh;
1249 if (tw < 1 || th < 1)
// source rows are walked downwards from (sheight - 1 - sy1) while target rows
// go upwards, so the image is flipped vertically; each row copies tw pixels of 4 bytes
1251 sy1 = sheight - 1 - sy1;
1252 for (y = 0;y < th;y++)
1253 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// the smaller mip levels are recalculated after the copy
1254 if (texture->mipmaps > 1)
1255 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture) carries the texture unit number and the texture
// pointer to bind.
1258 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Thread-side handler: if a texture is currently bound on the unit, its
// `binds` counter is atomically decremented; the unit is then pointed at
// command->texture.
1259 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1261 if (thread->texbound[command->unitnum])
1262 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1263 thread->texbound[command->unitnum] = command->texture;
1265 void DPSOFTRAST_SetTexture(int unitnum, int index)
1267 DPSOFTRAST_Command_SetTexture *command;
1268 DPSOFTRAST_Texture *texture;
1269 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1271 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1274 texture = DPSOFTRAST_Texture_GetByIndex(index);
1275 if (index && !texture)
1277 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1281 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1282 command->unitnum = unitnum;
1283 command->texture = texture;
1285 dpsoftrast.texbound[unitnum] = texture;
1286 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its stride in the global state.
// DPSOFTRAST_Array_Load uses the stride as a byte offset per vertex.
// The data is not copied here, so it must stay valid until it is consumed.
1289 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1291 dpsoftrast.pointer_vertex3f = vertex3f;
1292 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array: records the pointer and stride and clears
// the unsigned-byte color pointer so only one color source is active.
1294 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1296 dpsoftrast.pointer_color4f = color4f;
1297 dpsoftrast.pointer_color4ub = NULL;
1298 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array: records the pointer and stride
// and clears the float color pointer so only one color source is active.
1300 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1302 dpsoftrast.pointer_color4f = NULL;
1303 dpsoftrast.pointer_color4ub = color4ub;
1304 dpsoftrast.stride_color = stride;
// Records the texcoord array pointer, component count and stride for slot
// `unitnum` in the global state.
// NOTE(review): unitnum is used as an array index without a range check here -
// confirm callers only pass valid slots.
1306 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1308 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1309 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1310 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader) carries the shader mode, permutation and the
// exactspecularmath flag.
1313 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Thread-side handler: copies the three values into the thread's shader state.
1314 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1316 thread->shader_mode = command->mode;
1317 thread->shader_permutation = command->permutation;
1318 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: allocates a SetShader command holding the three values
// and also mirrors them in the global dpsoftrast state.
1320 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1322 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1323 command->mode = mode;
1324 command->permutation = permutation;
1325 command->exactspecularmath = exactspecularmath;
// keep the global copy in step with what was queued
1327 dpsoftrast.shader_mode = mode;
1328 dpsoftrast.shader_permutation = permutation;
1329 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Command 19 (Uniform4f) carries a uniform index and one 4-float value.
1332 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Thread-side handler: copies the 4 floats into thread->uniform4f starting at
// element index*4 (each uniform occupies 4 consecutive floats).
1333 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1335 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: allocates a Uniform4f command holding (v0, v1, v2, v3)
// for uniform `index` and mirrors the same four values in
// dpsoftrast.uniform4f[index*4 .. index*4+3].
1337 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1339 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1340 command->index = index;
1341 command->val[0] = v0;
1342 command->val[1] = v1;
1343 command->val[2] = v2;
1344 command->val[3] = v3;
// global copy of the uniform
1346 dpsoftrast.uniform4f[index*4+0] = v0;
1347 dpsoftrast.uniform4f[index*4+1] = v1;
1348 dpsoftrast.uniform4f[index*4+2] = v2;
1349 dpsoftrast.uniform4f[index*4+3] = v3;
// Pointer variant of DPSOFTRAST_Uniform4f: reads 4 floats from v, stores them
// in a newly allocated Uniform4f command and mirrors them in
// dpsoftrast.uniform4f at element index*4.
1351 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1353 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1354 command->index = index;
1355 memcpy(command->val, v, sizeof(command->val));
1357 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f) carries a uniform index and 16 floats; the
// value array is declared through ALIGN(), which DPSOFTRAST_UniformMatrix4fv
// relies on when it writes it with aligned SSE stores.
1360 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Thread-side handler: copies all 16 floats into thread->uniform4f starting at
// element index*4, so one matrix spans four consecutive 4-float uniforms.
1361 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1363 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
//   uniform   - first uniform slot; each matrix advances the slot by 4
//   arraysize - number of matrices read from v (16 floats each)
//   transpose - see the note at the transpose sequence below
//   v         - source floats; may be unaligned
// For every matrix one UniformMatrix4f command is allocated, and the same four
// rows are written to dpsoftrast.uniform4f.
1365 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1369 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1371 __m128 m0, m1, m2, m3;
1372 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1373 command->index = (DPSOFTRAST_UNIFORM)index;
// unaligned loads when v is not a multiple of ALIGN_SIZE, aligned loads otherwise
1374 if (((size_t)v)&(ALIGN_SIZE-1))
1376 m0 = _mm_loadu_ps(v);
1377 m1 = _mm_loadu_ps(v+4);
1378 m2 = _mm_loadu_ps(v+8);
1379 m3 = _mm_loadu_ps(v+12);
1383 m0 = _mm_load_ps(v);
1384 m1 = _mm_load_ps(v+4);
1385 m2 = _mm_load_ps(v+8);
1386 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3: interleave row pairs, then recombine the halves.
// Presumably applied only when `transpose` is set - the guarding condition is
// not visible in this excerpt.
1390 __m128 t0, t1, t2, t3;
1391 t0 = _mm_unpacklo_ps(m0, m1);
1392 t1 = _mm_unpacklo_ps(m2, m3);
1393 t2 = _mm_unpackhi_ps(m0, m1);
1394 t3 = _mm_unpackhi_ps(m2, m3);
1395 m0 = _mm_movelh_ps(t0, t1);
1396 m1 = _mm_movehl_ps(t1, t0);
1397 m2 = _mm_movelh_ps(t2, t3);
1398 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both command->val and the addressed part of
// dpsoftrast.uniform4f must be 16-byte aligned
1400 _mm_store_ps(command->val, m0);
1401 _mm_store_ps(command->val+4, m1);
1402 _mm_store_ps(command->val+8, m2);
1403 _mm_store_ps(command->val+12, m3);
1404 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1405 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1406 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1407 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i) carries a uniform index and one integer value.
1412 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Thread-side handler: stores the value in thread->uniform1i[index].
1413 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1415 thread->uniform1i[command->index] = command->val;
// Public entry point: allocates a Uniform1i command for uniform `index` and
// mirrors i0 in dpsoftrast.uniform1i.
// NOTE(review): the store of i0 into command->val is not visible in this
// excerpt - confirm it is written (DPSOFTRAST_Interpret_Uniform1i reads command->val).
1417 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1419 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1420 command->index = index;
1423 dpsoftrast.uniform1i[command->index] = i0;
// Copies 4-float vectors from a strided source into dst.
//   dst    - destination, written with aligned stores (must be 16-byte aligned)
//   src    - first source element, addressed in bytes
//   size   - number of vectors; `end` marks dst + size*4 floats
//   stride - byte distance between source elements (the stepping of src is not
//            visible in this excerpt)
// Unaligned loads are used when src or stride is not a multiple of ALIGN_SIZE;
// otherwise aligned loads are used.
1427 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1429 float *end = dst + size*4;
1430 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1434 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1443 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float vectors into 4-float vectors whose last component is 1.0.
//   dst    - destination, written with aligned stores (must be 16-byte aligned)
//   src    - first source element, addressed in bytes
//   size   - number of vectors; `end` marks dst + size*4 floats
//   stride - byte distance between source elements
// The fourth lane is set by rotating the register so the lane to replace sits
// in lane 0, overwriting lane 0 with 1.0 via _mm_move_ss, and rotating back.
// When stride == sizeof(float[3]) (tightly packed), four vertices are handled
// per step from three 16-byte loads (12 floats), up to end4. The remaining and
// strided vertices use the one-vertex paths at the end.
1450 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1452 float *end = dst + size*4;
1453 if (stride == sizeof(float[3]))
// end4: end of the part of dst covered by whole groups of 4 vertices
1455 float *end4 = dst + (size&~3)*4;
1456 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// packed, unaligned source: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1460 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1461 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1462 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1463 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1464 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1465 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1466 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1467 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1468 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1469 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1470 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1471 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1472 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1474 src += 4*sizeof(float[3]);
// packed, aligned source: same shuffle sequence with aligned loads
1481 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1482 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1483 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1484 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1486 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1487 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1488 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1489 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1490 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1491 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1492 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1493 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1495 src += 4*sizeof(float[3]);
// one vertex at a time: each step loads 4 floats, so it reads one float past
// the 3-component element before that lane is replaced by 1.0
1499 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1503 __m128 v = _mm_loadu_ps((const float *)src);
1504 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1505 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1506 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1507 _mm_store_ps(dst, v);
1516 __m128 v = _mm_load_ps((const float *)src);
1517 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1518 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1519 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1520 _mm_store_ps(dst, v);
// Expands 2-float vectors into 4-float vectors of the form (x, y, 0, 1).
//   dst    - destination, written with aligned stores (must be 16-byte aligned)
//   src    - first source element, addressed in bytes
//   size   - number of vectors; `end` marks dst + size*4 floats
//   stride - byte distance between source elements
// When stride == sizeof(float[2]) (tightly packed), two vertices are handled
// per 16-byte load, up to end2. The remaining and strided vertices use the
// one-vertex path at the end.
1527 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1529 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output vector
1530 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1531 if (stride == sizeof(float[2]))
// end2: end of the part of dst covered by whole pairs of vertices
1533 float *end2 = dst + (size&~1)*4;
1534 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = x0 y0 x1 y1; first store takes v's low half, second takes v's high half,
// both completed with v2's upper half
1538 __m128 v = _mm_loadu_ps((const float *)src);
1539 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1540 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1542 src += 2*sizeof(float[2]);
1549 __m128 v = _mm_load_ps((const float *)src);
1550 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1551 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1553 src += 2*sizeof(float[2]);
// one vertex: replace the low two lanes of v2 with the two source floats
1559 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-component unsigned-byte vectors into 4-float vectors, scaling
// each component by 1/255.
//   dst    - destination, written with aligned stores (must be 16-byte aligned)
//   src    - first source element, addressed in bytes
//   size   - number of vectors; `end` marks dst + size*4 floats
//   stride - byte distance between source elements
// Bytes are widened by interleaving with zero (8 -> 16 -> 32 bit), converted
// with _mm_cvtepi32_ps and multiplied by `scale`. When
// stride == sizeof(unsigned char[4]) (tightly packed), four vertices are
// handled per 16-byte load, up to end4. The remaining and strided vertices use
// the one-vertex path at the end.
1565 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1567 float *end = dst + size*4;
1568 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1569 if (stride == sizeof(unsigned char[4]))
// end4: end of the part of dst covered by whole groups of 4 vertices
1571 float *end4 = dst + (size&~3)*4;
1572 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 holds the low 8 bytes as 16-bit values, v2 the high 8 bytes
1576 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1577 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1578 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1579 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1580 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1582 src += 4*sizeof(unsigned char[4]);
1589 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1590 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1591 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1592 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1593 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1595 src += 4*sizeof(unsigned char[4]);
// one vertex: the 4 source bytes are read as a single int
1601 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1602 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes one constant 4-float vector into dst.
//   dst  - destination, written with aligned stores (must be 16-byte aligned)
//   src  - the 4 floats to replicate; read once with an unaligned load
//   size - number of vectors; `end` marks dst + 4*size floats
// The loop that repeats the store is not visible in this excerpt; presumably
// it runs until dst reaches `end`.
1608 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1610 float *end = dst + 4*size;
1611 __m128 v = _mm_loadu_ps(src);
1614 _mm_store_ps(dst, v);
// Transforms `numitems` 4-float vertices by a 4x4 matrix.
//   out4f       - destination vertices
//   in4f        - source vertices (may equal out4f)
//   numitems    - number of vertices
//   inmatrix16f - 16 floats, loaded as four 4-float vectors m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3. If the matrix compares
// bytewise equal to the identity, the input is simply copied (skipped when
// out4f == in4f).
1620 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1623 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1624 __m128 m0, m1, m2, m3;
1626 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1628 // fast case for identity matrix
1629 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1632 end = out4f + numitems*4;
// the matrix is always read with unaligned loads
1633 m0 = _mm_loadu_ps(inmatrix16f);
1634 m1 = _mm_loadu_ps(inmatrix16f + 4);
1635 m2 = _mm_loadu_ps(inmatrix16f + 8);
1636 m3 = _mm_loadu_ps(inmatrix16f + 12);
// two copies of the same loop: unaligned vertex loads vs aligned vertex loads
1637 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1641 __m128 v = _mm_loadu_ps(in4f);
1643 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1644 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1645 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1646 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1655 __m128 v = _mm_load_ps(in4f);
1657 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1658 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1659 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1660 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vertices from in4f to out4f with memcpy; the two
// ranges must not overlap.
1668 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1670 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides one vertex and maps it through the viewport.
// The input is rotated so w sits in lane 0, which is then replaced by 1.0,
// giving (1, x, y, z). The result viewportcenter + viewportscale * (1, x, y, z) / w
// is rotated back, so `out` holds the three mapped coordinates followed by the
// value computed from the 1.0 lane. viewportcenter and viewportscale are
// therefore consumed in the rotated lane order.
1674 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1676 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1677 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1678 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1679 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// The lines visible in this excerpt are the same statements as in
// DPSOFTRAST_PROJECTVERTEX: rotate w into lane 0, replace it with 1.0, apply
// viewportcenter + viewportscale * p / w, rotate back.
// NOTE(review): no difference from DPSOFTRAST_PROJECTVERTEX is visible in this
// excerpt - confirm whether this macro is meant to differ.
1682 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1684 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1685 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1686 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1687 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one vertex by a matrix given as four vectors:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, with each component of p broadcast
// across all lanes. The declaration of `p` is not visible in this excerpt;
// presumably it is a local copy of (in).
1690 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1693 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1694 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1695 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1696 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the range of screen rows covered by the bounding box of a vertex
// batch and a per-corner clip mask.
//   starty, endy      - outputs: truncated row bounds
//   minpos, maxpos    - componentwise min/max of the untransformed positions
//   viewportcenter,
//   viewportscale     - viewport mapping, in the lane order used by
//                       DPSOFTRAST_PROJECTVERTEX
//   m0..m3            - transform applied to the box corners
// The 8 corners are built by mixing lanes of minpos and maxpos. BBFRONT
// transforms corner k and computes clipdist = lane 2 + lane 3 of the result;
// when it is >= 0, bit k of clipmask is cleared and lane 0 divided by lane 3
// is folded into minproj/maxproj. The second macro handles corners whose bit
// is still set: for each neighbour (k^1, k^2, k^4) whose bit is clear, the
// edge is interpolated to the point where clipdist reaches zero and that
// point's projected value is folded in as well.
// The return statement is not visible in this excerpt; presumably clipmask is
// returned (callers store the result in dpsoftrast.drawclipped).
1699 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
// every corner starts out flagged
1701 int clipmask = 0xFF;
1702 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of each matrix vector so the transformed second
// component lands in lane 0, where the scalar (_ss) operations work
1703 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1704 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1705 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1706 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1707 #define BBFRONT(k, pos) \
1709 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1710 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1714 clipmask &= ~(1<<k); \
1715 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1716 minproj = _mm_min_ss(minproj, proj); \
1717 maxproj = _mm_max_ss(maxproj, proj); \
1721 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1722 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1723 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1724 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1725 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1726 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1730 if (clipmask&(1<<k)) \
1732 if (!(clipmask&(1<<(k^1)))) \
1734 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1735 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1736 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1737 minproj = _mm_min_ss(minproj, proj); \
1738 maxproj = _mm_max_ss(maxproj, proj); \
1740 if (!(clipmask&(1<<(k^2)))) \
1742 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1743 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1744 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1745 minproj = _mm_min_ss(minproj, proj); \
1746 maxproj = _mm_max_ss(maxproj, proj); \
1748 if (!(clipmask&(1<<(k^4)))) \
1750 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1751 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1752 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1753 minproj = _mm_min_ss(minproj, proj); \
1754 maxproj = _mm_max_ss(maxproj, proj); \
1758 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors into lane 0 for the scalar math below
1759 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1760 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// limit the projected range to [-2, 2] before mapping it to rows
1761 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1762 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1763 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1764 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj
// (presumably because the row scale is negative - TODO confirm)
1765 *starty = _mm_cvttss_si32(maxproj);
1766 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` vertices to screen space without a matrix transform.
//   out4f     - receives a copy of each input vertex (aligned stores)
//   screen4f  - receives each projected vertex (aligned stores)
//   starty,
//   endy      - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f      - source vertices; may be unaligned
// The componentwise min/max of the inputs is tracked and passed to
// DPSOFTRAST_Vertex_BoundY together with identity matrix rows. Returns that
// function's result.
1770 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1772 float *end = out4f + numitems*4;
1773 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1774 __m128 minpos, maxpos;
// two copies of the same loop: unaligned vertex loads vs aligned vertex loads
1775 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1777 minpos = maxpos = _mm_loadu_ps(in4f);
1780 __m128 v = _mm_loadu_ps(in4f);
1781 minpos = _mm_min_ps(minpos, v);
1782 maxpos = _mm_max_ps(maxpos, v);
1783 _mm_store_ps(out4f, v);
1784 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1785 _mm_store_ps(screen4f, v);
1793 minpos = maxpos = _mm_load_ps(in4f);
1796 __m128 v = _mm_load_ps(in4f);
1797 minpos = _mm_min_ps(minpos, v);
1798 maxpos = _mm_max_ps(maxpos, v);
1799 _mm_store_ps(out4f, v);
1800 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801 _mm_store_ps(screen4f, v);
// identity rows: the box corners are used untransformed
1808 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1809 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1810 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1811 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1812 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms `numitems` vertices by a 4x4 matrix and projects them to screen space.
//   out4f       - receives each transformed vertex (aligned stores)
//   screen4f    - receives each projected vertex (aligned stores)
//   starty,
//   endy        - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f        - source vertices; may be unaligned
//   inmatrix16f - 16 floats, loaded as four 4-float vectors m0..m3
// A matrix that compares bytewise equal to the identity is delegated to
// DPSOFTRAST_Vertex_Project. The min/max bounds are taken from the
// untransformed inputs; DPSOFTRAST_Vertex_BoundY applies m0..m3 to the box
// corners itself. Returns that function's result.
1816 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1818 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1819 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1821 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1822 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1823 end = out4f + numitems*4;
1824 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1825 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is always read with unaligned loads
1826 m0 = _mm_loadu_ps(inmatrix16f);
1827 m1 = _mm_loadu_ps(inmatrix16f + 4);
1828 m2 = _mm_loadu_ps(inmatrix16f + 8);
1829 m3 = _mm_loadu_ps(inmatrix16f + 12);
// two copies of the same loop: unaligned vertex loads vs aligned vertex loads
1830 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1832 minpos = maxpos = _mm_loadu_ps(in4f);
1835 __m128 v = _mm_loadu_ps(in4f);
1836 minpos = _mm_min_ps(minpos, v);
1837 maxpos = _mm_max_ps(maxpos, v);
1838 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1839 _mm_store_ps(out4f, v);
1840 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1841 _mm_store_ps(screen4f, v);
1849 minpos = maxpos = _mm_load_ps(in4f);
1852 __m128 v = _mm_load_ps(in4f);
1853 minpos = _mm_min_ps(minpos, v);
1854 maxpos = _mm_max_ps(maxpos, v);
1855 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1856 _mm_store_ps(out4f, v);
1857 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1858 _mm_store_ps(screen4f, v);
1865 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Loads one client vertex array into the 4-float working array
// dpsoftrast.post_array4f[outarray], covering dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex.
//   outarray - index of the destination working array
//   inarray  - which client array to read (position, color, or a texcoord slot)
// Positions are expanded from 3 to 4 floats. Colors come from the float
// pointer if set, else from the unsigned-byte pointer, else the constant
// dpsoftrast.color is replicated. Any other inarray value is treated as
// texcoord slot (inarray - DPSOFTRAST_ARRAY_TEXCOORD0), loaded by component count.
// The return statement is not visible in this excerpt; presumably outf is returned.
1870 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1873 float *outf = dpsoftrast.post_array4f[outarray];
1874 const unsigned char *inb;
1875 int firstvertex = dpsoftrast.firstvertex;
1876 int numvertices = dpsoftrast.numvertices;
1880 case DPSOFTRAST_ARRAY_POSITION:
1881 stride = dpsoftrast.stride_vertex;
// byte address of the first vertex to load
1882 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1883 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1885 case DPSOFTRAST_ARRAY_COLOR:
1886 stride = dpsoftrast.stride_color;
1887 if (dpsoftrast.pointer_color4f)
1889 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1890 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1892 else if (dpsoftrast.pointer_color4ub)
1894 stride = dpsoftrast.stride_color;
1895 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1896 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array set: use the constant color for every vertex
1900 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1904 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1905 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1907 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
// the case labels are not visible here; presumably 2, 3 and 4 components
// select the matching loader below
1908 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1911 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1914 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1917 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f. When inarray >= 0 the
// data is first loaded from that client array into post_array4f[outarray];
// otherwise the existing contents of post_array4f[outarray] are used.
// The return statement is not visible in this excerpt; presumably `data` is returned.
1929 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1931 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1932 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space. When inarray >= 0 the data is
// first loaded from that client array; otherwise post_array4f[outarray] is
// used as is. Screen coordinates go to dpsoftrast.screencoord4f, the row range
// to dpsoftrast.drawstarty/drawendy, and the projection's return value to
// dpsoftrast.drawclipped.
// The return statement is not visible in this excerpt; presumably `data` is returned.
1937 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1940 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1941 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to screen
// space. When inarray >= 0 the data is first loaded from that client array;
// otherwise post_array4f[outarray] is used as is. Screen coordinates go to
// dpsoftrast.screencoord4f, the row range to dpsoftrast.drawstarty/drawendy,
// and the return value of the transform-project call to dpsoftrast.drawclipped.
// The return statement is not visible in this excerpt; presumably `data` is returned.
1949 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1952 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1953 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Walks a span from span->startx to span->endx, computing an interpolated
// reciprocal of w per pixel.
//   triangle->w[0], w[1], w[2] - used as x slope, y slope and base of w
//   span->x, span->y           - span origin used to evaluate w
//   zf                         - output buffer; the store into it is not visible
//                                in this excerpt (presumably zf[x] = z)
// 1/w is evaluated exactly only at sub-span boundaries, which are
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart; between two boundaries the value is
// stepped linearly by dz.
1960 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1963 int startx = span->startx;
1964 int endx = span->endx;
1965 float wslope = triangle->w[0];
// w at pixel offset 0 of this span
1966 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1967 float endz = 1.0f / (w + wslope * startx);
1968 for (x = startx;x < endx;)
1970 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span ends at the final pixel of the span
1972 if (nextsub >= endx) nextsub = endsub = endx-1;
1973 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step inside the sub-span; 0 for a single-pixel sub-span
1974 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1975 for (; x <= endsub; x++, z += dz)
1980 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1983 int startx = span->startx;
1984 int endx = span->endx;
1987 unsigned char * RESTRICT pixelmask = span->pixelmask;
1988 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1991 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1992 // handle alphatest now (this affects depth writes too)
1993 if (thread->alphatest)
1994 for (x = startx;x < endx;x++)
1995 if (in4f[x*4+3] < 0.5f)
1996 pixelmask[x] = false;
1997 // FIXME: this does not handle bigendian
1998 switch(thread->fb_blendmode)
2000 case DPSOFTRAST_BLENDMODE_OPAQUE:
2001 for (x = startx;x < endx;x++)
2005 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2006 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2007 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2008 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2009 pixel[x*4+0] = d[0];
2010 pixel[x*4+1] = d[1];
2011 pixel[x*4+2] = d[2];
2012 pixel[x*4+3] = d[3];
2015 case DPSOFTRAST_BLENDMODE_ALPHA:
2016 for (x = startx;x < endx;x++)
2020 a = in4f[x*4+3] * 255.0f;
2021 b = 1.0f - in4f[x*4+3];
2022 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2023 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2024 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2025 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2026 pixel[x*4+0] = d[0];
2027 pixel[x*4+1] = d[1];
2028 pixel[x*4+2] = d[2];
2029 pixel[x*4+3] = d[3];
2032 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2033 for (x = startx;x < endx;x++)
2037 a = in4f[x*4+3] * 255.0f;
2038 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2039 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2040 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2041 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2042 pixel[x*4+0] = d[0];
2043 pixel[x*4+1] = d[1];
2044 pixel[x*4+2] = d[2];
2045 pixel[x*4+3] = d[3];
2048 case DPSOFTRAST_BLENDMODE_ADD:
2049 for (x = startx;x < endx;x++)
2053 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2054 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2055 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2056 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2057 pixel[x*4+0] = d[0];
2058 pixel[x*4+1] = d[1];
2059 pixel[x*4+2] = d[2];
2060 pixel[x*4+3] = d[3];
2063 case DPSOFTRAST_BLENDMODE_INVMOD:
2064 for (x = startx;x < endx;x++)
2068 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2069 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2070 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2071 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2072 pixel[x*4+0] = d[0];
2073 pixel[x*4+1] = d[1];
2074 pixel[x*4+2] = d[2];
2075 pixel[x*4+3] = d[3];
2078 case DPSOFTRAST_BLENDMODE_MUL:
2079 for (x = startx;x < endx;x++)
2083 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2084 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2085 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2086 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2087 pixel[x*4+0] = d[0];
2088 pixel[x*4+1] = d[1];
2089 pixel[x*4+2] = d[2];
2090 pixel[x*4+3] = d[3];
2093 case DPSOFTRAST_BLENDMODE_MUL2:
2094 for (x = startx;x < endx;x++)
2098 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2099 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2100 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2101 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2102 pixel[x*4+0] = d[0];
2103 pixel[x*4+1] = d[1];
2104 pixel[x*4+2] = d[2];
2105 pixel[x*4+3] = d[3];
2108 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2109 for (x = startx;x < endx;x++)
2113 a = in4f[x*4+3] * -255.0f;
2114 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2115 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2116 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2117 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2118 pixel[x*4+0] = d[0];
2119 pixel[x*4+1] = d[1];
2120 pixel[x*4+2] = d[2];
2121 pixel[x*4+3] = d[3];
2124 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2125 for (x = startx;x < endx;x++)
2130 b = 1.0f - in4f[x*4+3];
2131 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2132 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2133 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2134 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2135 pixel[x*4+0] = d[0];
2136 pixel[x*4+1] = d[1];
2137 pixel[x*4+2] = d[2];
2138 pixel[x*4+3] = d[3];
2141 case DPSOFTRAST_BLENDMODE_INVADD:
2142 for (x = startx;x < endx;x++)
2146 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2147 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2148 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2149 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2150 pixel[x*4+0] = d[0];
2151 pixel[x*4+1] = d[1];
2152 pixel[x*4+2] = d[2];
2153 pixel[x*4+3] = d[3];
// Writes one span of BGRA8 input colors (in4ub) into the color framebuffer
// dpsoftrast.fb_colorpixels[0], combining them with the pixels already there
// according to thread->fb_blendmode. span->pixelmask is consulted to decide
// which pixels of the span are touched; the alpha test below can clear more
// mask entries. Blending is done with SSE2 integer intrinsics on channels
// widened from 8 to 16 bits and packed back with unsigned saturation.
//   thread   - supplies alphatest and fb_blendmode
//   triangle - not referenced in the lines visible here
//   span     - supplies x, y, startx, endx and pixelmask
//   in4ub    - input colors, 4 bytes per pixel, indexed by x
// NOTE(review): several lines of this function (braces, declarations, parts
// of the macro below) are not visible in this excerpt; the comments describe
// only the statements shown.
2159 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2163 int startx = span->startx;
2164 int endx = span->endx;
// 32-bit view of the input colors: one unsigned int per 4-byte pixel
2165 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2166 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte view and 32-bit view of the same framebuffer
2167 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2168 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views to row span->y, column span->x; the x indices used below
// are relative to that position
2171 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2172 pixeli += span->y * dpsoftrast.fb_width + span->x;
2173 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub holds unsigned bytes, so "< 0.5f" is only true for an
// alpha byte of exactly 0; a mid-scale threshold such as 128 may have been
// intended - confirm.
2174 if (thread->alphatest)
2175 for (x = startx;x < endx;x++)
2176 if (in4ub[x*4+3] < 0.5f)
2177 pixelmask[x] = false;
2178 // FIXME: this does not handle bigendian
2179 switch(thread->fb_blendmode)
2181 case DPSOFTRAST_BLENDMODE_OPAQUE:
// four pixels at a time: when the four mask bytes starting at x are all 1
// (0x01010101 read as one unsigned int), copy 16 bytes with one unaligned
// 128-bit load and store. The advance of x, the handling of partially masked
// groups and the tail of the span are in lines not shown here.
// NOTE(review): pixelmask is an unsigned char array read through an
// unsigned int pointer (type punning, possibly unaligned).
2182 for (x = startx;x + 4 <= endx;)
2184 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2186 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// ALPHA: dst += ((src - dst) << 4) * (alpha << 4) >> 16, which is
// (src - dst) * alpha / 256 per channel, where alpha is 16-bit lane 3 of src
// broadcast across the pixel by the _MM_SHUFFLE(3, 3, 3, 3) shuffles.
2200 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1): shared loop skeleton for all blended modes.
// The main loop steps two pixels at a time and switches on the two mask
// bytes at x read as one unsigned short; src and dst are built by unpacking
// bytes to 16-bit lanes (64-bit loads for a pixel pair, 32-bit for a single
// pixel at x or x+1) and the result is packed back with unsigned saturation.
// A second loop handles the remaining pixel(s) one at a time, testing
// pixelmask[x].
// NOTE(review): the lines that expand blend2/blend1 are not visible;
// presumably blend2 is the two-pixel blend statement and blend1 the
// one-pixel one - confirm.
2201 #define FINISHBLEND(blend2, blend1) \
2202 for (x = startx;x + 1 < endx;x += 2) \
2205 switch (*(const unsigned short*)&pixelmask[x]) \
2208 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2209 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2211 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2214 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2215 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2217 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2220 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2223 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2228 for(;x < endx; x++) \
2231 if (!pixelmask[x]) \
2233 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2234 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2236 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2240 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2241 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2243 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2244 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2247 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2249 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2250 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2252 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2253 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (clamped to 255 by the saturating pack in the macro)
2256 case DPSOFTRAST_BLENDMODE_ADD:
2257 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2259 case DPSOFTRAST_BLENDMODE_INVMOD:
2261 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2263 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2266 case DPSOFTRAST_BLENDMODE_MUL:
2267 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2269 case DPSOFTRAST_BLENDMODE_MUL2:
2270 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8 (negative results become 0 at the
// saturating pack)
2272 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2274 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2275 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2277 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2278 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2281 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2283 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2284 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2286 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2287 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += ((255 - dst) << 4) * (src << 4) >> 16, which is
// (255 - dst) * src / 256
2290 case DPSOFTRAST_BLENDMODE_INVADD:
2292 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2294 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texture unit texunitindex for every pixel
// of the span and writes the result as four floats per pixel into out4f.
// Texel bytes 2,1,0,3 are written to out4f components 0,1,2,3, scaled to the
// 0..1 range. Texture coordinates come from vertex attribute arrayindex
// (via DPSOFTRAST_CALCATTRIB4F) multiplied by the per-pixel factor zf[].
//   thread       - supplies texbound[]
//   triangle     - supplies mip[] (mip level per texture unit)
//   span         - supplies startx/endx
//   out4f        - output, indexed as out4f[x*4 + component]
//   zf           - per-pixel factor, indexed by x
// NOTE(review): several lines of this function (declarations, braces, some
// assignments such as tcimin[] and tc[]) are not visible in this excerpt;
// the comments describe only the statements shown.
2301 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2304 int startx = span->startx;
2305 int endx = span->endx;
2310 float tc[2], endtc[2];
2312 unsigned int tci[2];
2313 unsigned int tci1[2];
2314 unsigned int tcimin[2];
2315 unsigned int tcimax[2];
2320 const unsigned char * RESTRICT pixelbase;
2321 const unsigned char * RESTRICT pixel[4];
2322 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2323 // if no texture is bound, just fill it with white
2326 for (x = startx;x < endx;x++)
2328 out4f[x*4+0] = 1.0f;
2329 out4f[x*4+1] = 1.0f;
2330 out4f[x*4+2] = 1.0f;
2331 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is used as the byte offset of the selected mip level inside
// texture->bytes; [2] and [3] are used below as its width and height.
2335 mip = triangle->mip[texunitindex];
2336 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2337 // if this mipmap of the texture is 1 pixel, just fill it with that color
// NOTE(review): the color is read from texture->bytes (offset 0), not from
// pixelbase (the selected mip level); the BGRA8 variant of this function
// reads from pixelbase - confirm which is intended.
2338 if (texture->mipmap[mip][1] == 4)
2340 c[0] = texture->bytes[2] * (1.0f/255.0f);
2341 c[1] = texture->bytes[1] * (1.0f/255.0f);
2342 c[2] = texture->bytes[0] * (1.0f/255.0f);
2343 c[3] = texture->bytes[3] * (1.0f/255.0f);
2344 for (x = startx;x < endx;x++)
2346 out4f[x*4+0] = c[0];
2347 out4f[x*4+1] = c[1];
2348 out4f[x*4+2] = c[2];
2349 out4f[x*4+3] = c[3];
// general case: set up filtering mode, attribute interpolants and the
// per-mip scale, clamp limits and wrap masks
2353 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2354 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2355 flags = texture->flags;
2356 tcscale[0] = texture->mipmap[mip][2];
2357 tcscale[1] = texture->mipmap[mip][3];
2358 tciwidth = texture->mipmap[mip][2];
2361 tcimax[0] = texture->mipmap[mip][2]-1;
2362 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 used as an AND mask for wrapping; this only wraps correctly when the
// sizes are powers of two - presumably guaranteed elsewhere, TODO confirm
2363 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2364 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel: (data + slope*x) * zf[x] * size - 0.5
2365 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2366 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// The span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: the coordinate is evaluated with zf[] at the sub-span end (nextsub)
// and stepped linearly in 16.16 fixed point in between.
2367 for (x = startx;x < endx;)
2369 unsigned int subtc[2];
2370 unsigned int substep[2];
2371 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2372 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: end exactly on the final pixel and rescale the step to the
// shortened length
2373 if (nextsub >= endx)
2375 nextsub = endsub = endx-1;
2376 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2380 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2381 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): these float values can be negative; converting a negative
// float to unsigned int is undefined behavior in C - confirm the intended
// wrap-around semantics.
2382 substep[0] = (endtc[0] - tc[0]) * subscale;
2383 substep[1] = (endtc[1] - tc[1]) * subscale;
2384 subtc[0] = tc[0] * (1<<16);
2385 subtc[1] = tc[1] * (1<<16);
// bilinear filtering, coordinates clamped to [tcimin, tcimax]
2388 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2390 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// NOTE(review): the integer texel index is subtc>>16, so the fraction lives in
// the low 16 bits; "&0xFFF" keeps only the lowest 12 of those bits rather
// than the top 12 ((subtc>>4)&0xFFF) - confirm.
2392 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2393 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// weights of the four neighbouring texels; they sum to 0x1000*0x1000
2394 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2395 tci[0] = subtc[0]>>16;
2396 tci[1] = subtc[1]>>16;
2397 tci1[0] = tci[0] + 1;
2398 tci1[1] = tci[1] + 1;
2399 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2400 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2401 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2402 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2403 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2404 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2405 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2406 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * 0x1000 * 0x1000: normalizes byte value times weight sum
2407 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2408 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2409 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2410 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2411 out4f[x*4+0] = c[0];
2412 out4f[x*4+1] = c[1];
2413 out4f[x*4+2] = c[2];
2414 out4f[x*4+3] = c[3];
// bilinear filtering, coordinates wrapped with tciwrapmask
2419 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2421 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2422 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2423 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2424 tci[0] = subtc[0]>>16;
2425 tci[1] = subtc[1]>>16;
2426 tci1[0] = tci[0] + 1;
2427 tci1[1] = tci[1] + 1;
2428 tci[0] &= tciwrapmask[0];
2429 tci[1] &= tciwrapmask[1];
2430 tci1[0] &= tciwrapmask[0];
2431 tci1[1] &= tciwrapmask[1];
2432 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2433 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2434 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2435 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2436 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2437 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2438 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2439 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2440 out4f[x*4+0] = c[0];
2441 out4f[x*4+1] = c[1];
2442 out4f[x*4+2] = c[2];
2443 out4f[x*4+3] = c[3];
// single-texel lookup, coordinates clamped to [tcimin, tcimax]
2447 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2449 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2451 tci[0] = subtc[0]>>16;
2452 tci[1] = subtc[1]>>16;
2453 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2454 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2455 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2456 c[0] = pixel[0][2] * (1.0f / 255.0f);
2457 c[1] = pixel[0][1] * (1.0f / 255.0f);
2458 c[2] = pixel[0][0] * (1.0f / 255.0f);
2459 c[3] = pixel[0][3] * (1.0f / 255.0f);
2460 out4f[x*4+0] = c[0];
2461 out4f[x*4+1] = c[1];
2462 out4f[x*4+2] = c[2];
2463 out4f[x*4+3] = c[3];
// single-texel lookup, coordinates wrapped with tciwrapmask
2468 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2470 tci[0] = subtc[0]>>16;
2471 tci[1] = subtc[1]>>16;
2472 tci[0] &= tciwrapmask[0];
2473 tci[1] &= tciwrapmask[1];
2474 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2475 c[0] = pixel[0][2] * (1.0f / 255.0f);
2476 c[1] = pixel[0][1] * (1.0f / 255.0f);
2477 c[2] = pixel[0][0] * (1.0f / 255.0f);
2478 c[3] = pixel[0][3] * (1.0f / 255.0f);
2479 out4f[x*4+0] = c[0];
2480 out4f[x*4+1] = c[1];
2481 out4f[x*4+2] = c[2];
2482 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture span sampler: samples the texture bound to
// texture unit texunitindex for every pixel of the span and writes 4 bytes
// per pixel into out4ub, copied or interpolated from the texel bytes without
// channel reordering. Texture coordinates come from vertex attribute
// arrayindex (via DPSOFTRAST_CALCATTRIB) multiplied by the per-pixel factor
// zf[]. Pixels are processed in pairs where possible.
//   thread   - supplies texbound[]
//   triangle - supplies mip[] (mip level per texture unit)
//   span     - supplies startx/endx
//   out4ub   - output, 4 bytes per pixel, indexed by x
//   zf       - per-pixel factor, indexed by x
// NOTE(review): several lines of this function (declarations, braces, the
// "if (filter)" structure, single-pixel tail conditions) are not visible in
// this excerpt; the comments describe only the statements shown.
2488 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2492 int startx = span->startx;
2493 int endx = span->endx;
2495 __m128 data, slope, tcscale;
2496 __m128i tcsize, tcmask, tcoffset, tcmax;
2498 __m128i subtc, substep, endsubtc;
// 32-bit view of the output: one unsigned int per pixel
2501 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2502 const unsigned char * RESTRICT pixelbase;
2503 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2504 // if no texture is bound, just fill it with white
2507 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
// mipmap[mip][0] is used as the byte offset of the selected mip level
2510 mip = triangle->mip[texunitindex];
2511 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2512 // if this mipmap of the texture is 1 pixel, just fill it with that color
2513 if (texture->mipmap[mip][1] == 4)
2515 unsigned int k = *((const unsigned int *)pixelbase);
2516 for (x = startx;x < endx;x++)
2520 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2521 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2522 flags = texture->flags;
// tcsize  = (m[2], m[3], m[2], m[3]) taken from mipmap[mip][0..3]; m[2]/m[3]
//           are used as width/height
// tcmask  = tcsize - 1 per 32-bit lane
// tcoffset = (width << 18) + 4 per 32-bit lane, i.e. 16-bit lane pair
//           (4, width*4): with _mm_madd_epi16 on (x, y) pairs this yields the
//           byte offset x*4 + y*width*4
// tcmax   = tcmask packed to 16-bit lanes (w-1, h-1, ...)
2523 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2524 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2525 tcscale = _mm_cvtepi32_ps(tcsize);
// pre-scale the attribute interpolants by the texture size; lanes 0,1 are
// duplicated into lanes 2,3
2526 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2527 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texel-space coordinate at the first pixel, and its 16.16 fixed-point form
2528 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2529 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2530 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2531 tcmax = _mm_packs_epi32(tcmask, tcmask);
// The span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: the coordinate is evaluated with zf[] at the sub-span end (nextsub)
// and stepped linearly in 16.16 fixed point in between.
2532 for (x = startx;x < endx;)
2534 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2535 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end exactly on the final pixel and rescale the step
2536 if (nextsub >= endx)
2538 nextsub = endsub = endx-1;
2539 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2543 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2544 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2545 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low 64 bits: coordinate of pixel x; high 64 bits: coordinate of pixel x+1;
// the step is doubled because each iteration advances two pixels
2546 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2547 substep = _mm_slli_epi32(substep, 1);
// Bilinear fast path: taken when the integer coordinates at the start of the
// sub-span and at endsubtc + substep are all >= 0 and < size-1, so the 2x2
// footprint needs no clamping or wrapping and horizontally adjacent texels
// can be fetched with one 64-bit load.
2550 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2551 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// row stride in bytes (high 16 bits of a tcoffset lane = width*4)
2553 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2554 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2556 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// gather the integer (high) 16-bit halves of the 16.16 coordinates into
// (x, y) pairs, then turn them into byte offsets with madd
2557 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2558 tci = _mm_madd_epi16(tci, tcoffset);
2559 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2560 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; pix1/pix2 are
// the upper/lower rows for the first pixel, pix3/pix4 for the second
2561 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2562 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2563 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2564 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fractions halved to 15 bits so they stay non-negative for the signed
// _mm_mulhi_epi16; the differences are doubled to compensate
2565 fracm = _mm_srli_epi16(subtc, 1);
// vertical lerp (y fraction), first pixel then second pixel
2566 pix1 = _mm_add_epi16(pix1,
2567 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2568 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2569 pix3 = _mm_add_epi16(pix3,
2570 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2571 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// regroup into left texels (pix2) and right texels (pix4) of both pixels,
// then horizontal lerp (x fraction) and store two output pixels
2572 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2573 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2574 pix2 = _mm_add_epi16(pix2,
2575 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2576 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2577 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the fast path (its guarding condition is not
// visible here; presumably the odd pixel left at the end of the sub-span)
2581 const unsigned char * RESTRICT ptr1;
2582 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2583 tci = _mm_madd_epi16(tci, tcoffset);
2584 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2585 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2586 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2587 fracm = _mm_srli_epi16(subtc, 1);
2588 pix1 = _mm_add_epi16(pix1,
2589 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2590 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2591 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2592 pix1 = _mm_add_epi16(pix1,
2593 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2594 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2595 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Bilinear, clamp-to-edge: the (x, y) pair is replicated four times, the
// corner offsets (0,0) (1,0) (0,1) (1,1) are added, each corner is clamped to
// [0, tcmax] and the four texels are fetched individually.
2599 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2601 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2603 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2604 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2605 tci = _mm_madd_epi16(tci, tcoffset);
2606 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2607 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2608 _mm_setzero_si128());
2609 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2610 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2611 _mm_setzero_si128());
// second pixel of the pair (high 64 bits of subtc)
// NOTE(review): this clamp-to-edge branch wraps the second pixel's corners
// with _mm_and_si128(..., tcmax) whereas the first pixel above is clamped
// with min/max; this looks like it was carried over from the wrapping branch
// - confirm.
2612 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2613 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2614 tci = _mm_madd_epi16(tci, tcoffset);
2615 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2616 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2617 _mm_setzero_si128());
2618 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2619 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2620 _mm_setzero_si128());
// same two-stage lerp as in the fast path
2621 fracm = _mm_srli_epi16(subtc, 1);
2622 pix1 = _mm_add_epi16(pix1,
2623 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2624 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2625 pix3 = _mm_add_epi16(pix3,
2626 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2627 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2628 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2629 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2630 pix2 = _mm_add_epi16(pix2,
2631 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2632 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2633 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the clamped bilinear path
2637 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2638 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2639 tci = _mm_madd_epi16(tci, tcoffset);
2640 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2641 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2642 _mm_setzero_si128());
2643 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2644 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2645 _mm_setzero_si128());
2646 fracm = _mm_srli_epi16(subtc, 1);
2647 pix1 = _mm_add_epi16(pix1,
2648 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2649 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2650 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2651 pix1 = _mm_add_epi16(pix1,
2652 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2653 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2654 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Bilinear, wrapping: same as above but each corner coordinate is wrapped by
// ANDing with tcmax (size-1), which assumes power-of-two sizes - TODO confirm
2660 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2662 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2663 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2664 tci = _mm_madd_epi16(tci, tcoffset);
2665 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2666 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2667 _mm_setzero_si128());
2668 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2669 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2670 _mm_setzero_si128());
2671 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2672 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2673 tci = _mm_madd_epi16(tci, tcoffset);
2674 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2675 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2676 _mm_setzero_si128());
2677 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2678 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2679 _mm_setzero_si128());
2680 fracm = _mm_srli_epi16(subtc, 1);
2681 pix1 = _mm_add_epi16(pix1,
2682 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2683 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2684 pix3 = _mm_add_epi16(pix3,
2685 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2686 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2687 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2688 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2689 pix2 = _mm_add_epi16(pix2,
2690 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2691 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2692 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the wrapping bilinear path
2696 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2697 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2698 tci = _mm_madd_epi16(tci, tcoffset);
2699 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2700 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2701 _mm_setzero_si128());
2702 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2703 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2704 _mm_setzero_si128());
2705 fracm = _mm_srli_epi16(subtc, 1);
2706 pix1 = _mm_add_epi16(pix1,
2707 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2708 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2709 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2710 pix1 = _mm_add_epi16(pix1,
2711 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2712 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2713 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Single-texel lookup, clamp-to-edge: integer coordinates clamped to
// [0, tcmax], one 32-bit texel copied per output pixel
2720 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2722 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2724 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2725 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2726 tci = _mm_madd_epi16(tci, tcoffset);
2727 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2728 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel version
2732 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2733 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2734 tci = _mm_madd_epi16(tci, tcoffset);
2735 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Single-texel lookup, wrapping: integer coordinates ANDed with tcmax
2741 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2743 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2744 tci = _mm_and_si128(tci, tcmax);
2745 tci = _mm_madd_epi16(tci, tcoffset);
2746 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2747 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel version
2751 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2752 tci = _mm_and_si128(tci, tcmax);
2753 tci = _mm_madd_epi16(tci, tcoffset);
2754 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2763 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2766 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Takes a pointer to float data (vector) and returns a single float.
// NOTE(review): only the signature is visible in this excerpt; the body, the
// number of floats read from vector and the meaning of the return value could
// not be checked here.
2769 float DPSOFTRAST_SampleShadowmap(const float *vector)
// For every pixel of the span, multiplies the four input floats in4f by an
// interpolated vertex attribute and stores the product in out4f.
// The attribute is evaluated per pixel as (data + slope * x) * z, with data
// and slope produced by DPSOFTRAST_CALCATTRIB4F for attribute arrayindex.
//   out4f / in4f - indexed as [x*4 + component]
//   zf           - not referenced in the visible lines
// NOTE(review): the declarations and the assignment of z are on lines not
// visible in this excerpt; presumably z = zf[x] - confirm.
2775 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2778 int startx = span->startx;
2779 int endx = span->endx;
2784 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2785 for (x = startx;x < endx;x++)
// interpolated attribute at this pixel
2788 c[0] = (data[0] + slope[0]*x) * z;
2789 c[1] = (data[1] + slope[1]*x) * z;
2790 c[2] = (data[2] + slope[2]*x) * z;
2791 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input by it, component by component
2792 out4f[x*4+0] = in4f[x*4+0] * c[0];
2793 out4f[x*4+1] = in4f[x*4+1] * c[1];
2794 out4f[x*4+2] = in4f[x*4+2] * c[2];
2795 out4f[x*4+3] = in4f[x*4+3] * c[3];
// For every pixel of the span, writes an interpolated vertex attribute as
// four floats into out4f.
// The attribute is evaluated per pixel as (data + slope * x) * z, with data
// and slope produced by DPSOFTRAST_CALCATTRIB4F for attribute arrayindex.
//   out4f - indexed as [x*4 + component]
//   zf    - not referenced in the visible lines
// NOTE(review): the declarations and the assignment of z are on lines not
// visible in this excerpt; presumably z = zf[x] - confirm.
2799 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2802 int startx = span->startx;
2803 int endx = span->endx;
2808 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2809 for (x = startx;x < endx;x++)
// interpolated attribute at this pixel
2812 c[0] = (data[0] + slope[0]*x) * z;
2813 c[1] = (data[1] + slope[1]*x) * z;
2814 c[2] = (data[2] + slope[2]*x) * z;
2815 c[3] = (data[3] + slope[3]*x) * z;
2816 out4f[x*4+0] = c[0];
2817 out4f[x*4+1] = c[1];
2818 out4f[x*4+2] = c[2];
2819 out4f[x*4+3] = c[3];
2823 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2825 int x, startx = span->startx, endx = span->endx;
2826 float c[4], localcolor[4];
2827 localcolor[0] = subcolor[0];
2828 localcolor[1] = subcolor[1];
2829 localcolor[2] = subcolor[2];
2830 localcolor[3] = subcolor[3];
2831 for (x = startx;x < endx;x++)
2833 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2834 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2835 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2836 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2837 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2838 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2839 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2840 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Component-wise product of two float RGBA span buffers: out = ina * inb.
// Covers pixels x in [span->startx, span->endx), four floats per pixel.
// triangle is not referenced in the visible lines.
2844 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2846 int x, startx = span->startx, endx = span->endx;
2847 for (x = startx;x < endx;x++)
2849 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2850 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2851 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2852 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Component-wise sum of two float RGBA span buffers: out = ina + inb.
// Covers pixels x in [span->startx, span->endx), four floats per pixel.
// No clamping is applied to the result. triangle is not referenced in the
// visible lines.
2856 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2858 int x, startx = span->startx, endx = span->endx;
2859 for (x = startx;x < endx;x++)
2861 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2862 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2863 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2864 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float RGBA span buffers per pixel: out = ina * a + inb * b,
// with a = 1 - (fourth component of inb) for that pixel.
// Covers pixels x in [span->startx, span->endx); all four components,
// including the fourth, use the same weights.
// NOTE(review): the declaration of a/b and the assignment of b are not visible
// in this listing; b is presumably inb4f[x*4+3] - confirm.
2868 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2870 int x, startx = span->startx, endx = span->endx;
2872 for (x = startx;x < endx;x++)
// weight of the first buffer is the complement of the second buffer's fourth component
2874 a = 1.0f - inb4f[x*4+3];
2876 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2877 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2878 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2879 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a float RGBA span buffer towards a single constant colour.
// With lerp = color[3], every component (including the fourth) becomes
//   out = in * (1 - lerp) + color * lerp
// for pixels x in [span->startx, span->endx).
// triangle is not referenced in the visible lines.
2883 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2885 int x, startx = span->startx, endx = span->endx;
2886 float localcolor[4], ilerp, lerp;
2887 localcolor[0] = color[0];
2888 localcolor[1] = color[1];
2889 localcolor[2] = color[2];
2890 localcolor[3] = color[3];
// blend weights are constant for the whole span, so compute them once
2891 ilerp = 1.0f - localcolor[3];
2892 lerp = localcolor[3];
2893 for (x = startx;x < endx;x++)
2895 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2896 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2897 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2898 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// SSE2: multiplies an 8-bit-per-channel span (in4ub) by an interpolated vertex
// attribute and writes the result to out4ub (4 bytes per pixel).
// The attribute (selected by arrayindex) is evaluated exactly, as
// (data + slope*x) * zf[x], only at sub-span end points; in between it is
// stepped linearly in fixed point where 256 represents a factor of 1.0.
// Sub-spans are at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels long.
// NOTE(review): several original lines are not visible in this listing
// (declarations of x, data, slope, mod, endmod; the assignments that carry
// endmod/endsubmod over into mod/submod; the guard and x increment around the
// single-pixel tail). Comments below describe only what the visible lines show.
2904 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2908 int startx = span->startx;
2909 int endx = span->endx;
2912 __m128i submod, substep, endsubmod;
2913 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (lanes 1 and 3 stay put); presumably RGBA -> BGRA order - confirm
2914 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2915 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact attribute value at the first pixel, also converted to x256 fixed point
2916 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2917 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2918 for (x = startx; x < endx;)
// default: a full-length sub-span, whose per-pixel scale is 256/MAXSUBSPAN
2920 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2921 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2922 if (nextsub >= endx)
// final (short) sub-span: end on the last pixel and rescale the step to its length
2924 nextsub = endsub = endx-1;
2925 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact value at the end of this sub-span, and the fixed-point delta per pixel
2929 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2930 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2931 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = factor for pixel x, high half = factor for pixel x+1
2932 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2933 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): substep is a one-pixel delta but is added once per two-pixel
// iteration, so the factor appears to advance at half rate - verify intent.
2934 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes are placed in the high byte of each 16-bit lane (value << 8) so that
// mulhi_epu16 yields (byte * factor) >> 8
2936 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2937 pix = _mm_mulhi_epu16(pix, submod);
// saturate to 0..255 and store 8 bytes (two pixels)
2938 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not visible here)
2942 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2943 pix = _mm_mulhi_epu16(pix, submod);
2944 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2: writes an interpolated vertex attribute directly as 8-bit-per-channel
// pixels into out4ub (4 bytes per pixel).
// The attribute (selected by arrayindex) is evaluated exactly, as
// (data + slope*x) * zf[x], only at sub-span end points; in between it is
// stepped linearly in fixed point scaled by 4095, and shifted right by 4 to
// reach the 0..255 byte range.
// NOTE(review): several original lines are not visible in this listing
// (declarations of x, data, slope, mod, endmod; the assignments that carry
// endmod/endsubmod over into mod/submod; the guard and x increment around the
// single-pixel tail). Comments below describe only what the visible lines show.
2951 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2955 int startx = span->startx;
2956 int endx = span->endx;
2959 __m128i submod, substep, endsubmod;
2960 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (lanes 1 and 3 stay put); presumably RGBA -> BGRA order - confirm
2961 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2962 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact attribute value at the first pixel, also converted to x4095 fixed point
2963 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2964 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2965 for (x = startx; x < endx;)
// default: a full-length sub-span, whose per-pixel scale is 4095/MAXSUBSPAN
2967 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2968 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2969 if (nextsub >= endx)
// final (short) sub-span: end on the last pixel and rescale the step to its length
2971 nextsub = endsub = endx-1;
2972 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact value at the end of this sub-span, and the fixed-point delta per pixel
2976 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2977 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2978 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
2979 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2980 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): substep is a one-pixel delta but is added once per two-pixel
// iteration, so the value appears to advance at half rate - verify intent.
2981 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, saturate to 0..255 and store 8 bytes (two pixels)
2983 __m128i pix = _mm_srai_epi16(submod, 4);
2984 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not visible here)
2988 __m128i pix = _mm_srai_epi16(submod, 4);
2989 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 8-bit version of the bloom combine: out = ina + max(inb - subcolor, 0),
// with the final result saturated to 0..255 per channel.
// subcolor is four floats; it is scaled by 255, converted to integers and has
// lanes 0 and 2 swapped (presumably RGBA -> BGRA - confirm) before use.
// NOTE(review): the guard around the single-pixel tail is not visible in this
// listing.
2996 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2999 int x, startx = span->startx, endx = span->endx;
3000 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes; the same colour ends up in both halves (one per pixel)
3001 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
3002 for (x = startx;x+2 <= endx;x+=2)
// zero-extend the bytes of two pixels to 16-bit lanes
3004 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3005 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps (inb - subcolor) at zero
3006 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3007 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3011 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3012 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3013 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3014 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit version of the buffer product: each output channel is
// (ina * inb) >> 8, i.e. the product of the two bytes divided by 256.
// Pixels are processed two at a time, followed by a single-pixel variant.
// NOTE(review): the guard around the single-pixel tail is not visible in this
// listing.
3019 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3022 int x, startx = span->startx, endx = span->endx;
3023 for (x = startx;x+2 <= endx;x+=2)
// pix1: bytes zero-extended to 16 bits; pix2: bytes placed in the high byte (value << 8)
3025 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3026 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of a * (b << 8) equals (a * b) >> 8
3027 pix1 = _mm_mulhi_epu16(pix1, pix2);
3028 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3032 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3033 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3034 pix1 = _mm_mulhi_epu16(pix1, pix2);
3035 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit version of the buffer sum: out = min(ina + inb, 255) per channel.
// The addition is done in 16-bit lanes and the pack step saturates to 0..255.
// Pixels are processed two at a time, followed by a single-pixel variant.
// NOTE(review): the guard around the single-pixel tail is not visible in this
// listing.
3040 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3043 int x, startx = span->startx, endx = span->endx;
3044 for (x = startx;x+2 <= endx;x+=2)
// zero-extend the bytes of two pixels from each buffer to 16-bit lanes
3046 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3047 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3048 pix1 = _mm_add_epi16(pix1, pix2);
3049 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3053 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3054 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3055 pix1 = _mm_add_epi16(pix1, pix2);
3056 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit tinted add: out = saturate(ina + (tint * inb) >> 8) per channel,
// where tint is inbtintbgra scaled by 256 (so 256 represents a factor of 1.0).
// Unlike the other colour parameters in this file section, the tint lanes are
// not shuffled; the parameter name suggests it is already in BGRA order.
// NOTE(review): the guard around the single-pixel tail is not visible in this
// listing.
3061 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3064 int x, startx = span->startx, endx = span->endx;
3065 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// narrow to 16-bit lanes; the same tint ends up in both halves (one per pixel)
3066 tint = _mm_packs_epi32(tint, tint);
3067 for (x = startx;x+2 <= endx;x+=2)
// pix1: bytes zero-extended; pix2: bytes placed in the high byte (value << 8)
3069 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3070 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of tint * (b << 8) equals (tint * b) >> 8
3071 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3072 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3076 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3077 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3078 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3079 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit blend of two span buffers using the fourth byte of each inb pixel
// as the weight: out = ina + ((inb - ina) * weight) >> 8 per channel, then
// saturated to 0..255.
// Pixels are processed two at a time, followed by a single-pixel variant.
// NOTE(review): the guard around the single-pixel tail is not visible in this
// listing.
3084 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3087 int x, startx = span->startx, endx = span->endx;
3088 for (x = startx;x+2 <= endx;x+=2)
3090 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3091 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel (its fourth channel) across that pixel's four lanes
3092 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both operands are shifted left by 4 so the signed high-half multiply yields (diff * weight) >> 8
3093 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3094 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant: only the low four lanes carry data, so one shuffle suffices
3098 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3099 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3100 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3101 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3102 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit blend of a span buffer towards one constant colour:
// out = in + ((color - in) * weight) >> 8 per channel, saturated to 0..255,
// where color is the four floats scaled by 255 (lanes 0 and 2 swapped,
// presumably RGBA -> BGRA - confirm) and weight is that colour's lane 3.
// NOTE(review): the guard around the single-pixel tail is not visible in this
// listing.
3107 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3110 int x, startx = span->startx, endx = span->endx;
3111 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes; the same colour ends up in both halves (one per pixel)
3112 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast lane 3 of the colour to every lane and pre-shift it by 4 for the multiply below
3113 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3114 for (x = startx;x+2 <= endx;x+=2)
3116 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (diff << 4) * (weight << 4), high 16 bits, equals (diff * weight) >> 8
3117 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3118 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3122 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3123 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3124 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the "generic" shader.
// Transforms/projects the position array with the ModelViewProjection matrix
// uniform, then loads the colour and first texcoord arrays. The second
// texcoord array is loaded only when the SPECULAR permutation bit is set.
3131 void DPSOFTRAST_VertexShader_Generic(void)
3133 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3134 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3135 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3136 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3137 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the "generic" shader, working on 8-bit BGRA span buffers.
// With the DIFFUSE permutation: samples the first texture unit, multiplies it
// by the interpolated attribute with index 1, and optionally (SPECULAR)
// combines the result with a second texture. The combined colour is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces and any plain `else` lines are not visible in this
// listing, so the exact nesting is inferred from the order of the statements.
// The call to DPSOFTRAST_Draw_Span_VaryingBGRA8 near the end is presumably the
// non-DIFFUSE branch - confirm.
3140 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3142 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3143 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3144 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3145 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is filled here and then passed to the sampling/interpolation helpers
3146 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3147 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// texture lookup uses attribute index 2; the modulating attribute uses index 1
3149 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3150 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3151 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3153 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
// second texture is multiplied into the colour
3154 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3157 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this condition is identical to the one tested just above, so
// the additive branch below can never be taken as written. A different
// permutation flag was probably intended - verify against the reference shader.
3159 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3162 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// second texture is blended in using its own fourth channel as the weight
3164 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3167 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// output the interpolated attribute (index 1) on its own, without any texture
3172 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3173 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader.
// Transforms/projects the position array with the ModelViewProjection matrix
// uniform and loads two texcoord arrays. The second load is called with
// (TEXCOORD1, TEXCOORD4); presumably it remaps input TEXCOORD4 to slot
// TEXCOORD1 - confirm against DPSOFTRAST_Array_Load's parameter order.
3178 void DPSOFTRAST_VertexShader_PostProcess(void)
3180 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3181 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3182 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader, working on 8-bit BGRA span buffers.
// Samples the first texture unit into the fragment colour, optionally adds a
// thresholded bloom texture (BLOOM permutation), then blends the result
// towards the ViewTintColor uniform. Saturation and gamma ramps are not
// implemented (see the TODOs below). The result goes to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
3185 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3187 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3188 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3189 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3190 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3191 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture is sampled with attribute index 2
3192 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3193 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture is sampled with attribute index 3 and added after subtracting
// the BloomColorSubtract uniform
3195 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3196 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3198 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3199 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3201 // TODO: implement saturation
3203 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3205 // TODO: implement gammaramps
3207 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is
// transformed/projected with the ModelViewProjection matrix uniform; no other
// vertex arrays are loaded.
3212 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3214 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: emits all-zero colour for the span.
// The bytes for pixels [span->startx, span->endx) of the fragment colour
// buffer are cleared and passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3217 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3219 // this is never called (because colormask is off when this shader is used)
3220 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3221 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3222 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, only the covered part of the span is cleared
3223 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3224 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-colour shader.
// Transforms/projects the position array with the ModelViewProjection matrix
// uniform and transforms the first texcoord array with the TexMatrix uniform.
3229 void DPSOFTRAST_VertexShader_FlatColor(void)
3231 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3232 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader: texture colour multiplied by a
// constant built from the Color_Ambient uniform (colour lanes) and the Alpha
// uniform (fourth lane).
// When neither alpha test nor blending is active, results are written straight
// into the first colour framebuffer at the span's position; otherwise they are
// written to a temporary buffer and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): some original lines are not visible in this listing
// (declarations of color/pix/pix2, the extra x advance after the four-pixel
// path, and any per-pixel mask test before the single-pixel path).
3235 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3238 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer row of this span, 4 bytes per pixel
3239 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3240 int x, startx = span->startx, endx = span->endx;
3241 __m128i Color_Ambientm;
3242 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3243 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3244 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3245 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3246 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finishing pass, so render to the temporary buffer
3247 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3248 pixel = buffer_FragColorbgra8;
// ambient colour x256 as integers with lanes 0 and 2 swapped (aligned load of the uniform)
3249 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear lane 3 and replace it with the Alpha uniform scaled by 255 (truncated to int)
3250 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3251 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrow to 16-bit lanes; the same constant ends up in both halves
3252 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3253 for (x = startx;x < endx;x++)
// fast path: at least four pixels remain and all four mask bytes equal 1
3256 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texture bytes go in the high byte of each 16-bit lane so mulhi yields (constant * byte) >> 8
3259 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3260 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3261 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3262 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one pixel
3268 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3269 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3270 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finishing pass
3272 if (pixel == buffer_FragColorbgra8)
3273 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-colour shader.
// Transforms/projects the position array with the ModelViewProjection matrix
// uniform, loads the colour array, and transforms the first texcoord array
// with the TexMatrix uniform.
3279 void DPSOFTRAST_VertexShader_VertexColor(void)
3281 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3282 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3283 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader:
//   out = (Color_Diffuse * vertexcolour + Color_Ambient) * texture
// computed in 16-bit fixed point, with the Alpha uniform in the fourth lane of
// the ambient term and the fourth lane of the diffuse term cleared.
// When neither alpha test nor blending is active, results are written straight
// into the first colour framebuffer at the span's position; otherwise they are
// written to a temporary buffer and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): some original lines are not visible in this listing
// (declarations of data/slope/mod2/pix2, the extra x advance after the
// four-pixel path, and any per-pixel mask test before the single-pixel path).
3286 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3289 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer row of this span, 4 bytes per pixel
3290 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3291 int x, startx = span->startx, endx = span->endx;
3292 __m128i Color_Ambientm, Color_Diffusem;
3294 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3295 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3296 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3297 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3298 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3299 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finishing pass, so render to the temporary buffer
3300 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3301 pixel = buffer_FragColorbgra8;
// ambient colour x256 with lanes 0/2 swapped; lane 3 replaced by Alpha uniform x255
3302 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3303 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3304 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3305 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour x4096 with lanes 0/2 swapped; lane 3 cleared
3306 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3307 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3308 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex-colour interpolants: lanes 0/2 swapped, advanced to startx, scaled by 4096
3309 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3310 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3311 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3312 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3313 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3314 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by one pixel per loop iteration
3315 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3317 __m128i color, mod, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3318 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// load four z values; each pixel's colour is data * its own z, with data
// stepped three times here (the loop increment supplies the fourth step)
3321 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3322 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3323 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3324 data = _mm_add_ps(data, slope);
3325 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3326 data = _mm_add_ps(data, slope);
3327 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3328 data = _mm_add_ps(data, slope);
3329 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolour + ambient) * texture, texture bytes in the high byte of each lane
3330 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3331 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3332 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3333 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3334 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one pixel
3340 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3341 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3342 mod = _mm_packs_epi32(mod, mod);
3343 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3344 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finishing pass
3346 if (pixel == buffer_FragColorbgra8)
3347 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader.
// Transforms/projects the position array with the ModelViewProjection matrix
// uniform, transforms the first texcoord array with the TexMatrix uniform, and
// loads the TEXCOORD4 array (used by the pixel stage for the lightmap lookup).
3353 void DPSOFTRAST_VertexShader_Lightmap(void)
3355 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3356 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3357 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader:
//   out = (Color_Diffuse * lightmap + Color_Ambient) * colourtexture
//         [+ Color_Glow * glowtexture, with the GLOW permutation]
// computed in 16-bit fixed point (uniforms scaled by 256), with the Alpha
// uniform in the fourth lane of the ambient term and the fourth lanes of the
// diffuse and glow terms cleared.
// The colour and glow textures are sampled with TEXCOORD0, the lightmap with
// TEXCOORD4. When neither alpha test nor blending is active, results are
// written straight into the first colour framebuffer; otherwise they go to a
// temporary buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): some original lines are not visible in this listing (braces,
// the `else` separating the glow and non-glow loops, the declaration of pix2,
// the extra x advance after each four-pixel path, and any per-pixel mask test
// before the single-pixel paths).
3360 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3363 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer row of this span, 4 bytes per pixel
3364 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3365 int x, startx = span->startx, endx = span->endx;
3366 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3367 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3368 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3369 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3370 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3371 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3372 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3373 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3374 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test or blending needs the finishing pass, so render to the temporary buffer
3375 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3376 pixel = buffer_FragColorbgra8;
// ambient colour x256 with lanes 0/2 swapped; lane 3 replaced by Alpha uniform x255
3377 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3378 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3379 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3380 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour x256 with lanes 0/2 swapped; lane 3 cleared
3381 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3384 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and prepare the glow colour like the diffuse one
3386 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3387 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3388 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3389 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path below
3390 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3391 for (x = startx;x < endx;x++)
3393 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3394 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3397 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3398 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3399 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// pixels 0-1: (diffuse*lightmap + ambient) * colour + glowcolour * glowtexture
3400 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3401 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3402 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
// pixels 2-3: same expression on the high halves
3403 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3404 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3405 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3406 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit colour is computed in the low 64 bits and the glow
// term in the high 64 bits of one register, then the two halves are summed
3412 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3413 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3414 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3415 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3416 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3417 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term
3422 for (x = startx;x < endx;x++)
3424 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3425 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3428 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3429 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3430 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3431 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3432 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3433 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3434 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one pixel
3440 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3441 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3442 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3443 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finishing pass
3446 if (pixel == buffer_FragColorbgra8)
3447 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shader wrappers
// below call these two functions before their definitions appear in the file.
3452 void DPSOFTRAST_VertexShader_LightDirection(void);
3453 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex stage of the fake-light shader: delegates entirely to the
// LightDirection vertex shader.
3455 void DPSOFTRAST_VertexShader_FakeLight(void)
3457 DPSOFTRAST_VertexShader_LightDirection();
// Pixel stage of the fake-light shader: forwards its arguments unchanged to
// the LightDirection pixel shader.
3460 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3462 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage of the model-space light-direction-map shader: runs the
// LightDirection vertex shader and additionally loads the TEXCOORD4 array.
3467 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3469 DPSOFTRAST_VertexShader_LightDirection();
3470 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Model-space light-direction-map pixel shader: forwards to the shared
// LightDirection pixel shader, which handles this mode by testing
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3473 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3475 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Tangent-space light-direction-map vertex shader: runs the shared
// LightDirection vertex shader, then additionally loads TEXCOORD4, which the
// pixel shader uses as the texture coordinate for its lightmap and deluxemap
// lookups.
3480 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3482 DPSOFTRAST_VertexShader_LightDirection();
3483 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Tangent-space light-direction-map pixel shader: forwards to the shared
// LightDirection pixel shader, which handles this mode by testing
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3486 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3488 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Shared vertex shader for the light-direction shader modes.
// For every vertex it expresses two vectors in the per-vertex basis formed by
// TEXCOORD1 (svector), TEXCOORD2 (tvector) and TEXCOORD3 (normal):
//   - the LightDir uniform, written to TEXCOORD5
//   - the vertex-to-eye vector (EyePosition - position), written to TEXCOORD6
// The w component of both outputs is set to 0. Positions are transformed by the
// model-view-projection matrix at the end. Takes no parameters; all inputs and
// outputs live in the global dpsoftrast state.
3493 void DPSOFTRAST_VertexShader_LightDirection(void)
3496 int numvertices = dpsoftrast.numvertices;
3498 float LightVector[4];
3499 float EyePosition[4];
3500 float EyeVectorModelSpace[4];
// Fetch the light direction and eye position uniforms (4 floats each).
3506 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3507 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3508 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3509 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3510 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3511 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3512 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3513 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays read from post_array4f in the loop below; TEXCOORD0 goes
// through the TexMatrix uniform instead of a plain load.
// NOTE(review): presumably Array_Load copies the input array into post_array4f
// and the first argument is the output slot - confirm against the helpers.
3514 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3515 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3516 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3517 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3518 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3519 for (i = 0;i < numvertices;i++)
// Read xyz of the position and of the three basis vectors for vertex i.
3521 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3522 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3523 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3524 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3525 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3526 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3527 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3528 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3529 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3530 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3531 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3532 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction in the (svector, tvector, normal) basis: one dot product per axis.
3533 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3534 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3535 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3536 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3537 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3538 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3539 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye, then re-expressed in the same basis.
3540 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3541 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3542 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3543 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3544 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3545 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3546 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3547 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3548 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3549 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Transform the positions by the model-view-projection matrix.
// NOTE(review): the meaning of the -1 second argument is not visible here - confirm.
3551 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Smaller / larger of two values. Each argument is evaluated twice, so do not
// pass expressions with side effects.
3554 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3555 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three components of two indexable vectors.
3556 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length of a 3-component vector (dot product with itself).
3557 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Length of a 3-component vector (square root of the squared length).
3558 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3559 #define DPSOFTRAST_Vector3Normalize(v)\
3562 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Shared pixel shader for the FakeLight, LightDirectionMap (model-space and
// tangent-space) and remaining light-direction modes.
// thread->shader_mode selects where the per-pixel light normal and light color
// come from; thread->shader_permutation selects one of three lighting paths:
// specular + diffuse, diffuse only, or ambient only (each with optional glow).
// Every path fills buffer_FragColorbgra8 for x in [startx, endx) and the result
// is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Parameters:
//   thread   - per-thread state; supplies uniform4f, shader_mode,
//              shader_permutation and shader_exactspecularmath
//   triangle - passed through to the span/attribute helpers
//   span     - supplies startx/endx and is passed to the span helpers
3573 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3575 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3576 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3581 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3582 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3583 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3584 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3585 int x, startx = span->startx, endx = span->endx;
3586 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3587 float LightVectordata[4];
3588 float LightVectorslope[4];
3589 float EyeVectordata[4];
3590 float EyeVectorslope[4];
3591 float VectorSdata[4];
3592 float VectorSslope[4];
3593 float VectorTdata[4];
3594 float VectorTslope[4];
3595 float VectorRdata[4];
3596 float VectorRslope[4];
3598 float diffusetex[4];
3600 float surfacenormal[4];
3601 float lightnormal[4];
3602 float lightnormal_modelspace[4];
3604 float specularnormal[4];
3607 float SpecularPower;
// Color uniforms are read with components 0 and 2 swapped so that index 0..2
// lines up with the byte order of the BGRA8 texel buffers (presumably the
// uniforms are stored as RGB - confirm).
3609 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3610 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3611 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3612 Color_Glow[3] = 0.0f;
3613 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3614 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3615 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The ambient alpha slot carries the Alpha uniform; it scales the output alpha.
3616 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3617 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3618 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3619 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3620 Color_Pants[3] = 0.0f;
3621 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3622 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3623 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3624 Color_Shirt[3] = 0.0f;
// Start the span and sample the textures every path needs; buffer_z is passed
// to each texture fetch (presumably per-pixel depth for perspective
// correction - confirm in DPSOFTRAST_Draw_Span_Begin).
3625 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3626 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3629 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3630 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3632 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3634 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Path 1: specular permutation - ambient + diffuse + specular lighting.
3636 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3638 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3639 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3640 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3641 Color_Diffuse[3] = 0.0f;
3642 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3643 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3644 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3645 LightColor[3] = 0.0f;
3646 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3647 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3648 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3649 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3650 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the gloss texel's alpha byte.
3651 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed by every mode on this path.
3652 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3653 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific setup: model-space needs the S/T/R basis attributes plus the
// lightmap and deluxemap; tangent-space needs only the two maps.
3655 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3657 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3658 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3659 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3660 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3661 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3663 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3665 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3666 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3668 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3670 // nothing of this needed
// Remaining modes (presumably the final else): interpolate the per-vertex light
// vector from TEXCOORD5.
3674 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3677 for (x = startx;x < endx;x++)
// Diffuse texel as floats, still in byte scale.
3680 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3681 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3682 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3683 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3684 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add the pants and shirt layers, each tinted by its color uniform.
3686 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3687 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3688 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3689 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3691 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3692 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3693 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3694 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: bytes map to roughly [-1, 1) via b/128 - 1, with
// the byte order reversed so index 0..2 is taken from bytes 2,1,0; then normalize.
3695 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3696 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3697 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3698 DPSOFTRAST_Vector3Normalize(surfacenormal);
3700 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3702 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3703 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3704 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3705 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3707 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3708 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3709 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3710 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3712 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3713 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3714 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3715 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3717 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3718 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3719 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3720 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3722 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3723 DPSOFTRAST_Vector3Normalize(lightnormal);
3725 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes scaled by 1/256 and divided by the light normal's z, with z
// clamped to at least 0.25 to bound the amplification.
3727 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3728 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3729 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3730 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3733 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// The deluxemap texel is used directly as the light normal (not renormalized here).
3735 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3736 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3737 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3739 float f = 1.0f / 256.0f;
3740 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3741 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3742 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3745 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: the light direction is the normalized eye vector and the light
// color is white.
3747 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3748 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3749 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3750 DPSOFTRAST_Vector3Normalize(lightnormal);
3752 LightColor[0] = 1.0;
3753 LightColor[1] = 1.0;
3754 LightColor[2] = 1.0;
// Remaining modes (presumably the final else): interpolated TEXCOORD5 light
// vector; LightColor keeps the uniform value read above.
3758 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3759 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3760 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3761 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(surface normal, light normal), clamped at zero.
3764 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3766 if(thread->shader_exactspecularmath)
3768 // reflect lightnormal at surfacenormal, take the negative of that
3769 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3771 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3772 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3773 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3774 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3776 // dot of this and normalize(EyeVectorFogDepth.xyz)
3777 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3778 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3779 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3780 DPSOFTRAST_Vector3Normalize(eyenormal);
3782 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Otherwise (presumably the else branch): use the normalized half-vector between
// the light and eye directions and dot it with the surface normal.
3786 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3787 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3788 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3789 DPSOFTRAST_Vector3Normalize(eyenormal);
3791 specularnormal[0] = lightnormal[0] + eyenormal[0];
3792 specularnormal[1] = lightnormal[1] + eyenormal[1];
3793 specularnormal[2] = lightnormal[2] + eyenormal[2];
3794 DPSOFTRAST_Vector3Normalize(specularnormal);
3796 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform scaled by the gloss texel's alpha channel.
3799 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine the terms per channel; only the upper bound (255) is clamped.
3800 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3802 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3803 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3804 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3805 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3809 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3810 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3811 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3812 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3815 buffer_FragColorbgra8[x*4+0] = d[0];
3816 buffer_FragColorbgra8[x*4+1] = d[1];
3817 buffer_FragColorbgra8[x*4+2] = d[2];
3818 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 2: diffuse permutation - ambient + diffuse lighting, no specular term.
// The color-mapping (pants/shirt) layers are not added on this path.
3821 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3823 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3824 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3825 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3826 Color_Diffuse[3] = 0.0f;
3827 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3828 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3829 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3830 LightColor[3] = 0.0f;
3831 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3833 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3835 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3836 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3837 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3838 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3839 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3841 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3843 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3844 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3846 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// On this path the eye vector is only needed for fake light, where it serves as
// the light direction.
3848 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3852 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3855 for (x = startx;x < endx;x++)
3858 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3859 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3860 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3861 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3862 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3863 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3864 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3865 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Per-mode light normal and light color, same derivation as on the specular path.
3867 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3869 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3870 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3871 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3872 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3874 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3875 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3876 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3877 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3879 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3880 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3881 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3882 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3884 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3885 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3886 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3887 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3889 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3890 DPSOFTRAST_Vector3Normalize(lightnormal);
3892 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3894 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3895 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3896 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3897 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3900 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3902 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3903 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3904 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3906 float f = 1.0f / 256.0f;
3907 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3908 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3909 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3912 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3914 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3915 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3916 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3917 DPSOFTRAST_Vector3Normalize(lightnormal);
3919 LightColor[0] = 1.0;
3920 LightColor[1] = 1.0;
3921 LightColor[2] = 1.0;
3925 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3926 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3927 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3928 DPSOFTRAST_Vector3Normalize(lightnormal);
3931 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Output = optional glow + diffuse texel * (ambient + diffuse * light color),
// clamped to 255 from above.
3932 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3934 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3935 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3936 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3937 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3941 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3942 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3943 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3944 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3946 buffer_FragColorbgra8[x*4+0] = d[0];
3947 buffer_FragColorbgra8[x*4+1] = d[1];
3948 buffer_FragColorbgra8[x*4+2] = d[2];
3949 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 3 (presumably the final else): neither specular nor diffuse - the diffuse
// texel scaled by the ambient color, plus optional glow.
3954 for (x = startx;x < endx;x++)
3957 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3958 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3959 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3960 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3962 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3964 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3965 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3966 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3967 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3971 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3972 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3973 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3974 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3976 buffer_FragColorbgra8[x*4+0] = d[0];
3977 buffer_FragColorbgra8[x*4+1] = d[1];
3978 buffer_FragColorbgra8[x*4+2] = d[2];
3979 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished fragment colors for this span to the span finisher.
3982 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the light-source shader mode.
// For every vertex it computes the vertex-to-light vector (LightPosition -
// position) and the vertex-to-eye vector (EyePosition - position), expresses
// both in the per-vertex basis read from TEXCOORD1 (svector), TEXCOORD2
// (tvector) and TEXCOORD3 (normal), and writes them back to TEXCOORD1 and
// TEXCOORD2 respectively with w = 0. It then transforms the positions by the
// model-view-projection matrix and fills TEXCOORD3 via the ModelToLight matrix.
// Takes no parameters; all inputs and outputs live in the global dpsoftrast state.
3987 void DPSOFTRAST_VertexShader_LightSource(void)
3990 int numvertices = dpsoftrast.numvertices;
3991 float LightPosition[4];
3992 float LightVector[4];
3993 float LightVectorModelSpace[4];
3994 float EyePosition[4];
3995 float EyeVectorModelSpace[4];
// Fetch the light position and eye position uniforms (4 floats each).
4001 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4002 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4003 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4004 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4005 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4006 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4007 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4008 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays read from post_array4f in the loop below; TEXCOORD0 goes
// through the TexMatrix uniform instead of a plain load.
// NOTE(review): presumably Array_Load copies the input array into post_array4f
// and the first argument is the output slot - confirm against the helpers.
4009 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4010 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4011 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4012 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4013 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4014 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4015 for (i = 0;i < numvertices;i++)
// Copy the position and basis vectors into locals first: TEXCOORD1 and TEXCOORD2
// are overwritten with the results further down in this iteration.
4017 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4018 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4019 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4020 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4021 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4022 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4023 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4024 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4025 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4026 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4027 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4028 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the light, re-expressed in the (svector, tvector,
// normal) basis and stored in TEXCOORD1.
4029 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4030 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4031 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4032 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4033 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4034 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4035 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4036 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4037 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4038 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vector from the vertex to the eye, re-expressed in the same basis and stored
// in TEXCOORD2.
4039 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4040 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4041 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4042 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4043 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4044 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4045 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4046 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4047 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4048 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Transform the positions by the model-view-projection matrix.
// NOTE(review): the meaning of the -1 second argument is not visible here - confirm.
4050 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Fill TEXCOORD3 from POSITION using the ModelToLight matrix.
// NOTE(review): argument order is presumably (output array, input array, matrix),
// and whether the input is the pre- or post-projection position depends on the
// helper - confirm.
4051 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span (pixel) shader for the LightSource shader mode (registered in DPSOFTRAST_ShaderModeTable).
// thread   - per-thread state; supplies uniform4f[] and the shader_permutation flags tested below
// triangle - triangle the span belongs to (source of the interpolated attributes)
// span     - span to shade; pixels startx..endx-1 are processed
// The result is accumulated in buffer_FragColorbgra8 and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Three lighting variants exist, selected by permutation: SPECULAR, DIFFUSE, and ambient-only.
4054 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers indexed by x; the *bgra8 buffers hold 4 bytes per pixel
4057 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4058 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4059 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4060 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4061 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4062 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4063 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4064 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4065 int x, startx = span->startx, endx = span->endx;
4066 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolants: value at x is (data + slope*x), later multiplied by z
4067 float CubeVectordata[4];
4068 float CubeVectorslope[4];
4069 float LightVectordata[4];
4070 float LightVectorslope[4];
4071 float EyeVectordata[4];
4072 float EyeVectorslope[4];
4074 float diffusetex[4];
4076 float surfacenormal[4];
4077 float lightnormal[4];
4079 float specularnormal[4];
4082 float SpecularPower;
4083 float CubeVector[4];
// Load color uniforms. Components 0,1,2 of each uniform are stored at indices 2,1,0,
// i.e. reversed, so they line up with the B,G,R byte order of the bgra8 buffers.
4086 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4087 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4088 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4089 Color_Glow[3] = 0.0f;
4090 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4091 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4092 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component carries the Alpha uniform rather than a color channel
4093 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4094 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4095 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4096 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4097 Color_Diffuse[3] = 0.0f;
4098 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4099 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4100 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4101 Color_Specular[3] = 0.0f;
4102 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4103 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4104 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4105 Color_Pants[3] = 0.0f;
4106 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4107 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4108 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4109 Color_Shirt[3] = 0.0f;
4110 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4111 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4112 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4113 LightColor[3] = 0.0f;
// pre-scaled by 1/255 because it is multiplied by glosstex[3] (a raw 0..255 byte value) in the pow() below
4114 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// set up interpolation of the TEXCOORD1 (light vector), TEXCOORD2 (eye vector) and TEXCOORD3 (cube vector) attributes
4115 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4116 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4117 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4118 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4119 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// fetch the textures needed by every variant; pants/shirt and cube only when their permutation bit is set
4120 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4121 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4123 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4124 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4126 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4127 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- variant 1: ambient + diffuse + specular (needs normal and gloss textures) ----
4128 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4130 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4131 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4132 for (x = startx;x < endx;x++)
// attenuation = 1 - |CubeVector|^2, so it falls to zero where the cube vector reaches unit length
4135 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4136 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4137 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4138 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// negligible light contribution at this pixel
4139 if (attenuation < 0.01f)
4141 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4143 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4144 if (attenuation < 0.01f)
// base color as floats in 0..255, in B,G,R,A order
4148 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4149 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4150 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4151 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4152 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add the pants and shirt layers, each tinted by its color uniform
4154 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4155 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4156 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4157 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4159 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4160 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4161 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4162 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte*(1/128)-1 maps 0..255 to roughly -1..+1; bytes 2,1,0 (R,G,B) become x,y,z
4163 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4164 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4165 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4166 DPSOFTRAST_Vector3Normalize(surfacenormal);
4168 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4169 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4170 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4171 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(N, L) clamped at zero
4173 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4175 if(thread->shader_exactspecularmath)
4177 // reflect lightnormal at surfacenormal, take the negative of that
4178 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4180 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4181 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4182 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4183 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4185 // dot of this and normalize(EyeVectorFogDepth.xyz)
4186 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4187 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4188 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4189 DPSOFTRAST_Vector3Normalize(eyenormal);
4191 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// half-vector form: specularnormal = normalize(lightnormal + eyenormal), dotted with the surface normal
4195 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4196 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4197 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4198 DPSOFTRAST_Vector3Normalize(eyenormal);
4200 specularnormal[0] = lightnormal[0] + eyenormal[0];
4201 specularnormal[1] = lightnormal[1] + eyenormal[1];
4202 specularnormal[2] = lightnormal[2] + eyenormal[2];
4203 DPSOFTRAST_Vector3Normalize(specularnormal);
4205 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent is SpecularPower scaled by the gloss texture's alpha byte
4207 specular = pow(specular, SpecularPower * glosstex[3]);
4209 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4211 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4212 attenuation *= (1.0f / 255.0f);
// per channel: (diffusetex*(ambient + diffuse*N.L) + gloss*specular) * light color * cube texel * attenuation, clamped to 255
4213 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4214 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4215 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
// alpha is taken straight from the (possibly colormapped) diffuse texture
4216 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube texture factor
4220 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4221 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4222 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4223 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4225 buffer_FragColorbgra8[x*4+0] = d[0];
4226 buffer_FragColorbgra8[x*4+1] = d[1];
4227 buffer_FragColorbgra8[x*4+2] = d[2];
4228 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- variant 2: ambient + diffuse (needs the normal texture, no gloss/specular) ----
4231 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4233 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4234 for (x = startx;x < endx;x++)
4237 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4238 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4239 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4240 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4241 if (attenuation < 0.01f)
4243 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4245 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4246 if (attenuation < 0.01f)
4250 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4251 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4252 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4253 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4254 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4256 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4257 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4258 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4259 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// decode and normalize the normal-map vector (same byte*(1/128)-1 mapping as in the specular variant)
4261 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4262 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4263 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4264 DPSOFTRAST_Vector3Normalize(surfacenormal);
4266 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4267 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4268 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4269 DPSOFTRAST_Vector3Normalize(lightnormal);
4271 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4272 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4274 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4275 attenuation *= (1.0f / 255.0f);
4276 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4277 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4278 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4279 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4283 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4284 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4285 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4286 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4288 buffer_FragColorbgra8[x*4+0] = d[0];
4289 buffer_FragColorbgra8[x*4+1] = d[1];
4290 buffer_FragColorbgra8[x*4+2] = d[2];
4291 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- variant 3: ambient term only (no normal map lookup) ----
4296 for (x = startx;x < endx;x++)
4299 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4300 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4301 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4302 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4303 if (attenuation < 0.01f)
4305 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4307 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4308 if (attenuation < 0.01f)
4312 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4313 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4314 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4315 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4316 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4318 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4319 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4320 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4321 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4323 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4325 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4326 attenuation *= (1.0f / 255.0f);
4327 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4328 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4329 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4330 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4334 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4335 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4336 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4337 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4339 buffer_FragColorbgra8[x*4+0] = d[0];
4340 buffer_FragColorbgra8[x*4+1] = d[1];
4341 buffer_FragColorbgra8[x*4+2] = d[2];
4342 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finishing routine
4345 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Refraction shader mode (registered in DPSOFTRAST_ShaderModeTable).
// Issues three array transforms using matrices stored in dpsoftrast.uniform4f:
//  - TEXCOORD1 / POSITION with the ModelViewProjection matrix (non-projecting transform);
//    the Refraction pixel shader reads TEXCOORD1 and performs the divide by w itself
//  - TEXCOORD0 / TEXCOORD0 with the texture matrix
//  - POSITION / POSITION with the ModelViewProjection matrix via the projecting variant
// NOTE(review): the first array argument is presumably the destination and the second the source - confirm against DPSOFTRAST_Array_Transform.
4351 void DPSOFTRAST_VertexShader_Refraction(void)
4353 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4354 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4355 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) shader for the Refraction shader mode (registered in DPSOFTRAST_ShaderModeTable).
// thread   - per-thread state; supplies uniform4f[] and the bound textures
// triangle - triangle the span belongs to
// span     - span to shade; pixels startx..endx-1 are processed
// For every pixel it derives a screen-space texture coordinate from the interpolated TEXCOORD1
// attribute (x and y divided by w, then scaled and offset by the ScreenScale/ScreenCenter uniforms),
// displaces it by the normalized normal-map vector times DistortScaleRefractReflect, samples the
// mipmap[0] image of the texture bound to GL20TU_REFRACTION (bilinear if the texture has
// DPSOFTRAST_TEXTURE_FILTER_LINEAR set) and writes the sample tinted by RefractColor.
// Returns without drawing anything when no refraction texture is bound.
4358 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4360 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4362 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4364 int x, startx = span->startx, endx = span->endx;
4367 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4368 //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4369 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4372 float ModelViewProjectionPositiondata[4];
4373 float ModelViewProjectionPositionslope[4];
4376 float ScreenScaleRefractReflect[2];
4377 float ScreenCenterRefractReflect[2];
4378 float DistortScaleRefractReflect[2];
4379 float RefractColor[4];
4381 const unsigned char * RESTRICT pixelbase;
4382 const unsigned char * RESTRICT pixel[4];
4383 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4384 if(!texture) return;
// mipmap[0][0] is used as the byte offset of the image inside texture->bytes; [0][2] and [0][3] are used below as its width and height
4385 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4388 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4389 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4390 //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
4393 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4396 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4397 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4398 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4399 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4400 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4401 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// RGB components are stored reversed (index 0 <- component 2) to line up with the B,G,R byte order of the sampled texels
4402 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4403 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4404 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4405 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4408 for (x = startx;x < endx;x++)
4410 float SafeScreenTexCoord[2];
4411 float ScreenTexCoord[2];
4418 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4419 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4421 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4422 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4423 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4425 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4426 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4427 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4428 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4429 DPSOFTRAST_Vector3Normalize(v);
4430 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4431 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4433 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4434 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// Bilinear path. tc is the sample position in 16.16 fixed point (integer texel = tc>>16 below),
// moved back by half a texel so the weights are relative to texel centers; 0.5 in 16.16 is 32768.
// The offset must be the same on both axes (the Y offset used to be mistyped as 32678).
4436 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32768};
// The lerp weights are 12-bit (0..0x1000), so take the TOP 12 bits of the 16-bit fraction.
// Masking the low 12 bits instead would make the weight wrap 16 times across one texel.
4437 unsigned int frac[2] = { (tc[0]>>4)&0xFFF, (tc[1]>>4)&0xFFF };
4438 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four weights always sum to 0x1000*0x1000 == 1<<24, which the >>24 below divides out again
4439 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4440 int tci[2] = { tc[0]>>16, tc[1]>>16 };
4441 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// clamp both texel coordinates (and their +1 neighbours) to the image
4442 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4443 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4444 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4445 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
// addresses of the 2x2 texel neighbourhood, 4 bytes per texel, rows mipmap[0][2] texels apart
4446 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4447 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4448 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4449 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4450 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4451 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4452 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// Unfiltered path: a single texel at the truncated, clamped coordinate
4456 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
4457 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4458 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4459 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4460 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4461 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4462 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4468 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// tint the sampled color; alpha comes from RefractColor[3] scaled by 256 and capped at 255
4469 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4470 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4471 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4472 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4475 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Water shader mode (registered in DPSOFTRAST_ShaderModeTable).
// Its only work is one projecting transform that passes POSITION as both array arguments,
// with the ModelViewProjection matrix taken from dpsoftrast.uniform4f.
4480 void DPSOFTRAST_VertexShader_Water(void)
4482 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) shader for the Water shader mode (registered in DPSOFTRAST_ShaderModeTable).
// Placeholder implementation: no texture or lighting work is done. It begins the span, zero-fills
// the covered part of the fragment buffer (span->startx up to span->endx, 4 bytes per pixel) and
// finishes the span, so every pixel is submitted as all-zero BGRA.
4486 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only passed to Draw_Span_Begin; nothing else in this function reads it
4489 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4490 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4491 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4492 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4493 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode (registered in DPSOFTRAST_ShaderModeTable).
// Its only work is one projecting transform that passes POSITION as both array arguments,
// with the ModelViewProjection matrix taken from dpsoftrast.uniform4f.
4498 void DPSOFTRAST_VertexShader_ShowDepth(void)
4500 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) shader for the ShowDepth shader mode (registered in DPSOFTRAST_ShaderModeTable).
// Placeholder implementation: despite the name, no depth value is written into the color output.
// It begins the span, zero-fills the covered part of the fragment buffer (span->startx up to
// span->endx, 4 bytes per pixel) and finishes the span with all-zero BGRA pixels.
4503 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only passed to Draw_Span_Begin; nothing else in this function reads it
4506 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4507 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4508 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4509 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4510 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode (registered in DPSOFTRAST_ShaderModeTable).
// Its only work is one projecting transform that passes POSITION as both array arguments,
// with the ModelViewProjection matrix taken from dpsoftrast.uniform4f.
4515 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4517 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) shader for the DeferredGeometry shader mode (registered in DPSOFTRAST_ShaderModeTable).
// Placeholder implementation: no geometry attributes are written. It begins the span, zero-fills
// the covered part of the fragment buffer (span->startx up to span->endx, 4 bytes per pixel) and
// finishes the span with all-zero BGRA pixels.
4520 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only passed to Draw_Span_Begin; nothing else in this function reads it
4523 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4524 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4525 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4526 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4527 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode (registered in DPSOFTRAST_ShaderModeTable).
// Its only work is one projecting transform that passes POSITION as both array arguments,
// with the ModelViewProjection matrix taken from dpsoftrast.uniform4f.
4532 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4534 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) shader for the DeferredLightSource shader mode (registered in DPSOFTRAST_ShaderModeTable).
// Placeholder implementation: no lighting is computed. It begins the span, zero-fills the covered
// part of the fragment buffer (span->startx up to span->endx, 4 bytes per pixel) and finishes the
// span with all-zero BGRA pixels.
4537 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only passed to Draw_Span_Begin; nothing else in this function reads it
4540 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4541 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4542 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4543 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4544 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Descriptor for one shader mode; DPSOFTRAST_ShaderModeTable holds one of these per mode.
4549 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage callback; takes no arguments (the vertex shaders work on the global dpsoftrast state)
4552 void (*Vertex)(void);
// span (pixel-stage) callback, invoked once per span with the thread, the owning triangle and the span
4553 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices associated with the mode; the table rows end the list with ~0 (255 as unsigned char)
4554 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* texture unit indices associated with the mode; the table rows that give a list end it with ~0
4555 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4557 DPSOFTRAST_ShaderModeInfo;
// Shader dispatch table, indexed by shader mode (DPSOFTRAST_Draw_ProcessSpans calls
// DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span). Each row gives, in order: a leading
// integer (2 in every row), the vertex callback, the span callback, the list of DPSOFTRAST_ARRAY_*
// indices and the list of GL20TU_* texture units, with ~0 closing each list.
// NOTE(review): row order presumably has to match the SHADERMODE_* enumeration - confirm when adding modes.
4559 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4561 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4562 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4563 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4564 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4565 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4566 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4567 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4568 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4569 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4570 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4571 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4572 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
// NOTE(review): the four rows below give no texunits initializer, so C zero-fills texunits[]
// (every entry is 0) instead of starting it with the ~0 marker used by the rows above.
// Confirm that the code walking texunits[] tolerates this, or add an explicit {~0}.
4573 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4574 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4575 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4576 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Drains the span queue of one thread. For each queued span it optionally
// depth-tests the span against dpsoftrast.fb_depthpixels, runs the current
// shader mode's Span callback when a color buffer is bound and color writes
// are enabled, and finally resets thread->numspans to 0.
4579 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4586 // unsigned int *colorpixel;
4587 unsigned int *depthpixel;
4593 DPSOFTRAST_State_Triangle *triangle;
4594 DPSOFTRAST_State_Span *span;
// per-pixel pass/fail flags for the span currently being processed
4595 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4596 for (i = 0; i < thread->numspans; i++)
4598 span = &thread->spans[i];
4599 triangle = &thread->triangles[span->triangle];
// depth-tested path: only taken when depth testing is on and a depth buffer exists
4600 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope, w[2] = origin) at the span position
4602 wslope = triangle->w[0];
4603 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// convert to integer depth units; the starting depth also includes the polygon offset terms
4604 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4605 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel[x] addresses the depth buffer relative to the span's (x, y) origin
4606 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4607 startx = span->startx;
// fill pixelmask[startx..endx) by comparing the stored depth (left operand)
// against the interpolated depth d (right operand) using the active depth function
4609 switch(thread->fb_depthfunc)
4612 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4613 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4614 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4615 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4616 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4617 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4618 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4620 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4621 //for (x = startx;x < endx;x++)
4622 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4623 // if there is no color buffer, skip pixel shader
// scan past leading and trailing pixels whose mask is clear so the shader sees the tightest range
4624 while (startx < endx && !pixelmask[startx])
4626 while (endx > startx && !pixelmask[endx-1])
4629 continue; // no pixels to fill
// hand the mask and the trimmed start to the shader through the span itself
4630 span->pixelmask = pixelmask;
4631 span->startx = startx;
4633 // run pixel shader if appropriate
4634 // do this before running depthmask code, to allow the pixelshader
4635 // to clear pixelmask values for alpha testing
4636 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4637 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// with depth writes enabled, walk the trimmed range again with the same depth interpolation
// (presumably storing d for pixels still set in pixelmask - TODO confirm against the loop body)
4638 if (thread->depthmask)
4639 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4645 // no depth testing means we're just dealing with color...
4646 // if there is no color buffer, skip pixel shader
4647 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes: mark the whole [startx, endx) range as set
4649 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4650 span->pixelmask = pixelmask;
4651 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the queue has been consumed
4655 thread->numspans = 0;
// Declares the Draw command (number 22) and its payload fields.
// NOTE(review): DEFCOMMAND is defined outside this view; it presumably generates
// DPSOFTRAST_Command_Draw and DPSOFTRAST_OPCODE_Draw, which the code below uses - confirm.
// Field use visible in this file: starty/endy bound the rows the draw touches,
// refcount is decremented once per thread that has handled the command,
// arrays points at the vertex data, element3i/element3s are the index lists.
4658 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of a single thread.
// For every triangle of the command it builds screen-space points (clipping
// against the near plane), computes the bounding box, sets up the w plane and
// per-array attribute planes, picks mip levels, and queues spans for the
// scanlines that fall inside this thread's row bands. Queued spans are handed
// to DPSOFTRAST_Draw_ProcessSpans. The command's refcount is decremented once
// this thread is done with it.
//   thread  - per-thread rasterizer state (scissor, bands, span/triangle queues)
//   command - the Draw command to interpret
4660 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4663 int cullface = thread->cullface;
4664 int minx, maxx, miny, maxy;
4665 int miny1, maxy1, miny2, maxy2;
4666 __m128i fbmin, fbmax;
4667 __m128 viewportcenter, viewportscale;
4668 int firstvertex = command->firstvertex;
4669 int numvertices = command->numvertices;
4670 int numtriangles = command->numtriangles;
4671 const int *element3i = command->element3i;
4672 const unsigned short *element3s = command->element3s;
4673 int clipped = command->clipped;
4680 int starty, endy, bandy;
4684 __m128 triangleedge1, triangleedge2, trianglenormal;
4687 DPSOFTRAST_State_Triangle *triangle;
4688 DPSOFTRAST_Texture *texture;
4689 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical clip range comes from the scissor rectangle; both of this thread's row bands are clamped into it
4690 miny = thread->fb_scissor[1];
4691 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4692 miny1 = bound(miny, thread->miny1, maxy);
4693 maxy1 = bound(miny, thread->maxy1, maxy);
4694 miny2 = bound(miny, thread->miny2, maxy);
4695 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range misses both bands: nothing to rasterize here
4696 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
// drop this thread's reference; a command no larger than the bare header keeps its vertex data
// in a separately allocated block (see DPSOFTRAST_Draw_AllocateDrawCommand), freed when the count reaches zero
4698 if (!ATOMIC_DECREMENT(command->refcount))
4700 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4701 MM_FREE(command->arrays);
4705 minx = thread->fb_scissor[0];
4706 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clip rectangle packed as repeated 16-bit (x, y) pairs for the SIMD bounds test; fbmax is made inclusive by subtracting 1
4707 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4708 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4709 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4710 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4711 screen[3] = _mm_setzero_ps();
4712 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4713 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen-space coordinates (4 floats per vertex);
// the next array, addressed through 'arrays', is the one whose z and w feed the near-plane test below
4715 const float *screencoord4f = command->arrays;
4716 const float *arrays = screencoord4f + numvertices*4;
4718 // generate the 3 edges of this triangle
4719 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the three vertex indices (16-bit or 32-bit list), rebased to firstvertex
4722 e[0] = element3s[i*3+0] - firstvertex;
4723 e[1] = element3s[i*3+1] - firstvertex;
4724 e[2] = element3s[i*3+2] - firstvertex;
4728 e[0] = element3i[i*3+0] - firstvertex;
4729 e[1] = element3i[i*3+1] - firstvertex;
4730 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros local to the triangle loop:
//   SKIPBACKFACE      - builds two edge vectors from screen[0..2], takes the scalar (z) part of their
//                       cross product and tests its sign
//   CLIPPEDVERTEXLERP - stores the near-plane crossing fraction between two vertices in clipfrac[p1]
//                       and projects the interpolated vertex into screen[k]
//   CLIPPEDVERTEXCOPY - copies an already projected vertex into screen[k]
//   GENATTRIB*        - fetch or interpolate (using clipfrac) the three attribute values of the
//                       current array, selected by the clip case
4739 #define SKIPBACKFACE \
4740 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4741 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4742 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4743 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4744 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4748 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4752 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4757 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4758 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4760 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4761 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4763 #define CLIPPEDVERTEXCOPY(k,p1) \
4764 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4766 #define GENATTRIBCOPY(attrib, p1) \
4767 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4768 #define GENATTRIBLERP(attrib, p1, p2) \
4770 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4771 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4773 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4777 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4778 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4779 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4780 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4781 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4782 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4783 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4789 // calculate distance from nearplane
4790 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4791 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4792 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify the three vertices by the sign of clipdist and build 3 or 4 screen-space points:
// vertices in front are copied, edges crossing the plane contribute an interpolated point
4793 if (clipdist[0] >= 0.0f)
4795 if (clipdist[1] >= 0.0f)
4797 if (clipdist[2] >= 0.0f)
4800 // triangle is entirely in front of nearplane
4801 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4808 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4816 if (clipdist[2] >= 0.0f)
4818 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4825 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4832 else if (clipdist[1] >= 0.0f)
4834 if (clipdist[2] >= 0.0f)
4836 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4843 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4849 else if (clipdist[2] >= 0.0f)
4851 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4856 else continue; // triangle is entirely behind nearplane
// screeni packs the truncated (x, y) of up to four points as 16-bit values (point 2 is repeated
// when there are only three); the min/max reductions below yield the bounding box
4859 // calculate integer y coords for triangle points
4860 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4861 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4862 screenmin = _mm_min_epi16(screeni, screenir),
4863 screenmax = _mm_max_epi16(screeni, screenir);
4864 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4865 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the bounding box to the clip rectangle
4866 screenmin = _mm_max_epi16(screenmin, fbmin);
4867 screenmax = _mm_min_epi16(screenmax, fbmax);
4868 // skip offscreen triangles
4869 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// row range covered by the triangle: starty inclusive, endy exclusive
4871 starty = _mm_extract_epi16(screenmin, 1);
4872 endy = _mm_extract_epi16(screenmax, 1)+1;
// rows lie entirely between the end of band 1 and the start of band 2: not this thread's work
4873 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift leaves each point's integer y in a 32-bit lane, used for the per-row edge classification
4875 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle queue
4878 triangle = &thread->triangles[thread->numtriangles];
4880 // calculate attribute plans for triangle data...
4881 // okay, this triangle is going to produce spans, we'd better project
4882 // the interpolants now (this is what gives perspective texturing),
4883 // this consists of simply multiplying all arrays by the W coord
4884 // (which is basically 1/Z), which will be undone per-pixel
4885 // (multiplying by Z again) to get the perspective-correct array
4888 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4889 __m128 mipedgescale, mipdensity;
// edge components divided by the scalar part of the triangle normal; the four broadcasts below are the
// coefficients that turn per-vertex differences into screen-space x and y gradients
4890 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4891 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4892 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4893 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4894 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w plane: x gradient, y gradient and origin of w, stored in triangle->w[0..2]
4895 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4896 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4897 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4898 attribedge1 = _mm_sub_ss(w0, w1);
4899 attribedge2 = _mm_sub_ss(w2, w1);
4900 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4901 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4902 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4903 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4904 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4905 _mm_store_ss(&triangle->w[0], attribxslope);
4906 _mm_store_ss(&triangle->w[1], attribyslope);
4907 _mm_store_ss(&triangle->w[2], attriborigin);
4908 mipedgescale = _mm_setzero_ps();
// same plane setup for each vertex array listed by the shader mode, with values premultiplied by w;
// the list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL
4909 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4911 __m128 attrib0, attrib1, attrib2;
4912 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4913 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next array stored in the command's data block
4915 arrays += numvertices*4;
4916 GENATTRIBS(attrib0, attrib1, attrib2);
4917 attriborigin = _mm_mul_ps(attrib1, w1);
4918 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4919 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4920 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4921 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4922 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4923 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4924 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4925 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array that drives LOD selection: attribute change along each of the two edges,
// scaled by the reciprocal screen length of that edge
4926 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4928 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4929 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4930 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4931 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for each texture unit the shader mode lists (all levels start at 0)
4935 memset(triangle->mip, 0, sizeof(triangle->mip));
4936 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4938 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4939 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4941 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed level
4942 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by the two integers loaded from texture->mipmap[0][2..3] (presumably the base level's width and height - TODO confirm),
// square, sum per edge, and keep the smaller of the two edge densities
4944 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4945 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4946 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4947 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4948 // this will be multiplied in the texturing routine by the texture resolution
4949 y = _mm_cvtss_si32(mipdensity);
// level = half of log2 of the squared density, clamped to the last available mipmap
4952 y = (int)(log((float)y)*0.5f/M_LN2);
4953 if (y > texture->mipmaps - 1)
4954 y = texture->mipmaps - 1;
4955 triangle->mip[texunit] = y;
// walk the triangle's rows band by band: the first pass stops at maxy1,
// later passes resume at miny2 (if y is still below it) and stop at maxy2
4961 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4964 __m128 xcoords, xslope;
// compare row y against every point's integer y; the resulting mask selects which two edges
// (prev point -> next point pairs) are active for this row
4965 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4966 int yccmask = _mm_movemask_epi8(ycc);
4967 int edge0p, edge0n, edge1p, edge1n;
4974 case 0xFFFF: /*0000*/ y = endy; continue;
4975 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4976 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4977 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4978 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4979 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4980 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4981 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4982 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4983 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4984 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4985 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4986 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4987 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4988 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4989 case 0x0000: /*1111*/ y++; continue;
4997 case 0xFFFF: /*000*/ y = endy; continue;
4998 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4999 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5000 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5001 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5002 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5003 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5004 case 0x0000: /*111*/ y++; continue;
// nexty = smallest point y among the points row y has not passed yet (passed points are replaced
// by a large positive value before the min reduction), limited to the last row of the current band
5007 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5008 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5009 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5010 nexty = _mm_extract_epi16(ycc, 0);
5011 if (nexty >= bandy) nexty = bandy-1;
// per-row x step of both active edges (dx/dy), and their x at row y with 0.5 added before the integer conversion
5012 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5013 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5014 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5015 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5016 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low half: swap the two edges when edge 0 starts to the right of edge 1
5017 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5019 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5020 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// emit spans for rows y..nexty, stepping both edge x positions by their slopes each row
5022 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5024 int startx, endx, offset;
5025 startx = _mm_cvtss_si32(xcoords);
5026 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5029 if (startx < 0) startx = 0;
// advance startx toward minx in whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH
// (the mask arithmetic assumes that constant is a power of two - TODO confirm)
5030 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5032 if (endx > maxx) endx = maxx;
5033 if (startx >= endx) continue;
// split the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// span->startx/endx are relative to the chunk's offset
5034 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5036 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5037 span->triangle = thread->numtriangles;
5040 span->startx = max(minx - offset, 0);
5041 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5042 if (span->startx >= span->endx)
// span queue full: shade what has been queued so far (this resets numspans)
5044 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5045 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush pending spans so the triangle slots can be reused
5050 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5052 DPSOFTRAST_Draw_ProcessSpans(thread);
5053 thread->numtriangles = 0;
// this thread is done reading the command's data: drop its reference and free
// separately allocated vertex data when the count reaches zero
5057 if (!ATOMIC_DECREMENT(command->refcount))
5059 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5060 MM_FREE(command->arrays);
// shade whatever is still queued before returning
5063 if (thread->numspans > 0 || thread->numtriangles > 0)
5065 DPSOFTRAST_Draw_ProcessSpans(thread);
5066 thread->numtriangles = 0;
// Allocates a Draw command together with storage for its vertex data and a
// private copy of the index list, and points dpsoftrast.screencoord4f and
// dpsoftrast.post_array4f[] at that storage.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of triangles in the index list
//   element3i / element3s    - 32-bit / 16-bit index lists (copied into the command)
// Data layout: screen coordinates, position array, then one float[4]-per-vertex
// array for each array listed by the current shader mode, then the indices.
5071 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5075 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two fixed arrays (screen coordinates and position) ...
5076 int datasize = 2*numvertices*sizeof(float[4]);
5077 DPSOFTRAST_Command_Draw *command;
5078 unsigned char *data;
// ... plus one array per entry of the shader mode's array list (list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL)
5079 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5081 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5082 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5084 datasize += numvertices*sizeof(float[4]);
// ... plus the index list, sized for 16-bit or 32-bit triples
// (presumably chosen by which of element3s/element3i was supplied - TODO confirm)
5087 datasize += numtriangles*sizeof(unsigned short[3]);
5089 datasize += numtriangles*sizeof(int[3]);
5090 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to embed: allocate only the header as a command and put the data in its own zeroed block
5091 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5093 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5094 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data lives directly behind the command header
5098 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5099 data = (unsigned char *)command + commandsize;
5101 command->firstvertex = firstvertex;
5102 command->numvertices = numvertices;
5103 command->numtriangles = numtriangles;
5104 command->arrays = (float *)data;
// carve the data block into the output arrays; arrays not used by the shader mode stay NULL
5105 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5106 dpsoftrast.firstvertex = firstvertex;
5107 dpsoftrast.numvertices = numvertices;
5108 dpsoftrast.screencoord4f = (float *)data;
5109 data += numvertices*sizeof(float[4]);
5110 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5111 data += numvertices*sizeof(float[4]);
5112 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5114 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5115 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5117 dpsoftrast.post_array4f[j] = (float *)data;
5118 data += numvertices*sizeof(float[4]);
// copy the caller's indices behind the vertex arrays so the command does not depend on the caller's memory
5120 command->element3i = NULL;
5121 command->element3s = NULL;
5124 command->element3s = (unsigned short *)data;
5125 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5129 command->element3i = (int *)data;
5130 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the current shader
// mode's Vertex callback, records the affected row range, and submits the
// command to the worker threads (or flushes immediately when threads are not used).
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of triangles
//   element3i / element3s    - 32-bit / 16-bit index lists
5135 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5137 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5138 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the row range to the framebuffer (dpsoftrast.drawstarty/drawendy are set outside this function)
5139 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5140 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated vertex data and take the command back out of the pool
5141 if (command->starty >= command->endy)
5143 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5144 MM_FREE(command->arrays);
5145 DPSOFTRAST_UndoCommand(command->commandsize);
5148 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread drops its reference in DPSOFTRAST_Interpret_Draw
5149 command->refcount = dpsoftrast.numthreads;
5151 if (dpsoftrast.usethreads)
// publish the command, then wake the starving threads whose row bands overlap this draw
5154 DPSOFTRAST_Draw_SyncCommands();
5155 for (i = 0; i < dpsoftrast.numthreads; i++)
5157 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5158 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5159 Thread_CondSignal(thread->drawcond);
// the other path executes everything that is pending right away
5164 DPSOFTRAST_Draw_FlushThreads();
// Executes the commands in dpsoftrast.commandpool for one thread, starting at
// thread->commandoffset and stopping when endoffset is reached; the new
// position is written back to thread->commandoffset.
//   thread    - thread state the commands are applied to
//   endoffset - pool offset at which interpretation stops
5168 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5170 int commandoffset = thread->commandoffset;
5171 while (commandoffset != endoffset)
5173 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5174 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call its interpreter,
// advance by the aligned struct size, and wrap to 0 at the end of the pool
5176 #define INTERPCOMMAND(name) \
5177 case DPSOFTRAST_OPCODE_##name : \
5178 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5179 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5180 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5181 commandoffset = 0; \
5183 INTERPCOMMAND(Viewport)
5184 INTERPCOMMAND(ClearColor)
5185 INTERPCOMMAND(ClearDepth)
5186 INTERPCOMMAND(ColorMask)
5187 INTERPCOMMAND(DepthTest)
5188 INTERPCOMMAND(ScissorTest)
5189 INTERPCOMMAND(Scissor)
5190 INTERPCOMMAND(BlendFunc)
5191 INTERPCOMMAND(BlendSubtract)
5192 INTERPCOMMAND(DepthMask)
5193 INTERPCOMMAND(DepthFunc)
5194 INTERPCOMMAND(DepthRange)
5195 INTERPCOMMAND(PolygonOffset)
5196 INTERPCOMMAND(CullFace)
5197 INTERPCOMMAND(AlphaTest)
5198 INTERPCOMMAND(AlphaFunc)
5199 INTERPCOMMAND(SetTexture)
5200 INTERPCOMMAND(SetShader)
5201 INTERPCOMMAND(Uniform4f)
5202 INTERPCOMMAND(UniformMatrix4f)
5203 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so the advance uses the size stored in the command;
// progress is written to thread->commandoffset right after each draw
5205 case DPSOFTRAST_OPCODE_Draw:
5206 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5207 commandoffset += command->commandsize;
5208 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5210 thread->commandoffset = commandoffset;
5213 case DPSOFTRAST_OPCODE_Reset:
// record where interpretation stopped
5218 thread->commandoffset = commandoffset;
// Worker thread entry point. Keeps interpreting newly submitted commands while
// thread->index is non-negative, and sleeps on thread->drawcond when it has
// caught up with dpsoftrast.drawcommand.
//   data - the DPSOFTRAST_State_Thread this worker services
5221 static int DPSOFTRAST_Draw_Thread(void *data)
5223 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5224 while(thread->index >= 0)
// pending commands: execute everything up to the current submission point
5226 if (thread->commandoffset != dpsoftrast.drawcommand)
5228 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// re-check under the mutex before sleeping so a submission made in between is not missed
5232 Thread_LockMutex(thread->drawmutex);
5233 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// tell a flusher blocked on waitcond that this thread has caught up, then sleep;
// 'starving' marks the thread as waiting for work so submitters know to signal drawcond
5235 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5236 thread->starving = true;
5237 Thread_CondWait(thread->drawcond, thread->drawmutex);
5238 thread->starving = false;
5240 Thread_UnlockMutex(thread->drawmutex);
// Makes sure every thread state has processed all submitted commands, then
// marks the command pool as empty (usedcommands = 0).
5246 static void DPSOFTRAST_Draw_FlushThreads(void)
5248 DPSOFTRAST_State_Thread *thread;
5250 DPSOFTRAST_Draw_SyncCommands();
5251 if (dpsoftrast.usethreads)
// pass 1: wake every sleeping worker that still has commands to process
5253 for (i = 0; i < dpsoftrast.numthreads; i++)
5255 thread = &dpsoftrast.threads[i];
5256 if (thread->commandoffset != dpsoftrast.drawcommand)
5258 Thread_LockMutex(thread->drawmutex);
5259 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5260 Thread_CondSignal(thread->drawcond);
5261 Thread_UnlockMutex(thread->drawmutex);
// pass 2: block on each worker's waitcond until it has caught up
// (the worker signals waitcond when it finds thread->waiting set)
5264 for (i = 0; i < dpsoftrast.numthreads; i++)
5266 thread = &dpsoftrast.threads[i];
5267 if (thread->commandoffset != dpsoftrast.drawcommand)
5269 Thread_LockMutex(thread->drawmutex);
5270 if (thread->commandoffset != dpsoftrast.drawcommand)
5272 thread->waiting = true;
5273 Thread_CondWait(thread->waitcond, thread->drawmutex);
5274 thread->waiting = false;
5276 Thread_UnlockMutex(thread->drawmutex);
// this loop interprets the pending commands directly on the calling thread for each thread state
// (presumably the path taken when worker threads are not in use - TODO confirm)
5282 for (i = 0; i < dpsoftrast.numthreads; i++)
5284 thread = &dpsoftrast.threads[i];
5285 if (thread->commandoffset != dpsoftrast.drawcommand)
5286 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// everything has been consumed: the pool can be reused
5289 dpsoftrast.commandpool.usedcommands = 0;
// Public wrapper: processes all pending commands via DPSOFTRAST_Draw_FlushThreads.
5292 void DPSOFTRAST_Flush(void)
5294 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments and returning nothing.
// NOTE(review): presumably waits for outstanding draw work like DPSOFTRAST_Flush - confirm against the body.
5297 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and the per-thread states.
//   width, height - framebuffer dimensions in pixels
//   numthreads    - requested worker count; threads are used only when this is > 0 and
//                   Thread_HasThreads() reports support, and the count is clamped to 1..64
//   interlace     - clamped to 0..1 and honoured only when threads are used; selects the
//                   two-bands-per-thread row split
//   colorpixels   - color buffer stored as fb_colorpixels[0]
//   depthpixels   - depth buffer stored as fb_depthpixels
5302 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero global state
5312 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5313 dpsoftrast.bigendian = u.b[3];
5314 dpsoftrast.fb_width = width;
5315 dpsoftrast.fb_height = height;
5316 dpsoftrast.fb_depthpixels = depthpixels;
5317 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): the next three lines all assign index 1; indices 2 and 3 were probably intended.
// Harmless at present because the memset above already zeroed the whole structure.
5318 dpsoftrast.fb_colorpixels[1] = NULL;
5319 dpsoftrast.fb_colorpixels[1] = NULL;
5320 dpsoftrast.fb_colorpixels[1] = NULL;
// default viewport covers the whole framebuffer
5321 dpsoftrast.viewport[0] = 0;
5322 dpsoftrast.viewport[1] = 0;
5323 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5324 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5325 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5326 dpsoftrast.texture_firstfree = 1;
5327 dpsoftrast.texture_end = 1;
5328 dpsoftrast.texture_max = 0;
5329 dpsoftrast.color[0] = 1;
5330 dpsoftrast.color[1] = 1;
5331 dpsoftrast.color[2] = 1;
5332 dpsoftrast.color[3] = 1;
// threading configuration: without threads there is exactly one thread state and no interlacing
5333 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5334 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5335 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5336 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5337 for (i = 0; i < dpsoftrast.numthreads; i++)
5339 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread
5341 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned among the lines visible here - confirm it is initialized
5342 thread->colormask[1] = 1;
5343 thread->colormask[2] = 1;
5344 thread->colormask[3] = 1;
5345 thread->blendfunc[0] = GL_ONE;
5346 thread->blendfunc[1] = GL_ZERO;
5347 thread->depthmask = true;
5348 thread->depthtest = true;
5349 thread->depthfunc = GL_LEQUAL;
5350 thread->scissortest = false;
5351 thread->alphatest = false;
5352 thread->alphafunc = GL_GREATER;
5353 thread->alphavalue = 0.5f;
5354 thread->viewport[0] = 0;
5355 thread->viewport[1] = 0;
5356 thread->viewport[2] = dpsoftrast.fb_width;
5357 thread->viewport[3] = dpsoftrast.fb_height;
5358 thread->scissor[0] = 0;
5359 thread->scissor[1] = 0;
5360 thread->scissor[2] = dpsoftrast.fb_width;
5361 thread->scissor[3] = dpsoftrast.fb_height;
5362 thread->depthrange[0] = 0;
5363 thread->depthrange[1] = 1;
5364 thread->polygonoffset[0] = 0;
5365 thread->polygonoffset[1] = 0;
// interlaced split: the framebuffer is cut into 2*numthreads equal row slices and
// thread i owns slice i and slice numthreads+i
5367 if (dpsoftrast.interlace)
5369 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5370 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5371 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5372 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// plain split: numthreads equal row slices, both band pairs describe the same slice i
5376 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5377 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// empty work queues and idle synchronisation flags
5380 thread->numspans = 0;
5381 thread->numtriangles = 0;
5382 thread->commandoffset = 0;
5383 thread->waiting = false;
5384 thread->starving = false;
// mark all state as needing validation and validate it once up front
5386 thread->validate = -1;
5387 DPSOFTRAST_Validate(thread, -1);
// create the synchronisation objects and start the worker running DPSOFTRAST_Draw_Thread
5389 if (dpsoftrast.usethreads)
5391 thread->waitcond = Thread_CreateCond();
5392 thread->drawcond = Thread_CreateCond();
5393 thread->drawmutex = Thread_CreateMutex();
5394 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Tears the rasterizer down: wakes and joins each worker thread, destroys its
// synchronisation objects, frees texture storage and the thread array, and
// zeroes the global state.
5400 void DPSOFTRAST_Shutdown(void)
5403 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5405 DPSOFTRAST_State_Thread *thread;
5406 for (i = 0; i < dpsoftrast.numthreads; i++)
5408 thread = &dpsoftrast.threads[i];
// wake the worker while holding its mutex, then wait for it to exit
// (the worker loop runs while thread->index >= 0)
5409 Thread_LockMutex(thread->drawmutex);
5411 Thread_CondSignal(thread->drawcond);
5412 Thread_UnlockMutex(thread->drawmutex);
5413 Thread_WaitThread(thread->thread, 0);
5414 Thread_DestroyCond(thread->waitcond);
5415 Thread_DestroyCond(thread->drawcond);
5416 Thread_DestroyMutex(thread->drawmutex);
// release the pixel data of every texture slot in use, then the texture table itself
5419 for (i = 0;i < dpsoftrast.texture_end;i++)
5420 if (dpsoftrast.texture[i].bytes)
5421 MM_FREE(dpsoftrast.texture[i].bytes);
5422 if (dpsoftrast.texture)
5423 free(dpsoftrast.texture);
5424 if (dpsoftrast.threads)
5425 MM_FREE(dpsoftrast.threads);
// leave the global state zeroed
5426 memset(&dpsoftrast, 0, sizeof(dpsoftrast));