3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
// Let this file spell the engine's qboolean type as 'bool'.
13 typedef qboolean bool;
// Byte alignment passed to _mm_malloc by MM_MALLOC/MM_CALLOC.
17 #define ATOMIC_SIZE 32
// __attribute__ flavour: ALIGN() requests 16-byte and ATOMIC() 32-byte alignment of a declaration.
21 #define ALIGN(var) var __attribute__((__aligned__(16)))
22 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// The barrier is an SSE store fence; the commented-out line keeps __sync_synchronize() as an alternative.
24 #define MEMORY_BARRIER (_mm_sfence())
25 //(__sync_synchronize())
// Counters built on the __sync builtins: INCREMENT/DECREMENT evaluate to the updated value,
// ADD discards its result.
26 #define ATOMIC_COUNTER volatile int
27 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
28 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
29 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC flavour: same macro set expressed with __declspec(align) and the Interlocked* functions.
31 #elif defined(_MSC_VER)
32 #define ALIGN(var) __declspec(align(16)) var
33 #define ATOMIC(var) __declspec(align(32)) var
35 #define MEMORY_BARRIER (_mm_sfence())
37 #define ATOMIC_COUNTER volatile LONG
38 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
39 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
// NOTE(review): unlike the other two variants this ATOMIC_ADD is not cast to void, so it yields a value here.
40 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// Fallback flavour: ALIGN()/ATOMIC() add no alignment at all.
49 #define ALIGN(var) var
50 #define ATOMIC(var) var
55 #include <SDL_thread.h>
// Plain, non-atomic versions of the barrier/counter macros, plus void stand-ins for the SDL types.
// NOTE(review): presumably selected when threading support is compiled out - confirm against the
// surrounding preprocessor conditionals.
57 #define MEMORY_BARRIER ((void)0)
58 #define ATOMIC_COUNTER int
59 #define ATOMIC_INCREMENT(counter) (++(counter))
60 #define ATOMIC_DECREMENT(counter) (--(counter))
61 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
62 typedef void SDL_Thread;
63 typedef void SDL_cond;
64 typedef void SDL_mutex;
68 #include <emmintrin.h>
// Aligned allocation helpers: MM_MALLOC returns memory aligned to ATOMIC_SIZE bytes via _mm_malloc.
70 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-like helper on top of _mm_malloc: allocates nmemb*size bytes aligned to ATOMIC_SIZE
// and zero-fills them when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow here, which plain calloc would do.
72 static void *MM_CALLOC(size_t nmemb, size_t size)
74 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
75 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Memory from the two helpers above must be released with _mm_free.
79 #define MM_FREE _mm_free
// Alternative definitions that map straight onto the C library allocator.
81 #define MM_MALLOC(size) malloc(size)
82 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays (position, color, eight texcoord sets).
// DPSOFTRAST_ARRAY_TOTAL is the array count and sizes tables such as
// DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f.
86 typedef enum DPSOFTRAST_ARRAY_e
88 DPSOFTRAST_ARRAY_POSITION,
89 DPSOFTRAST_ARRAY_COLOR,
90 DPSOFTRAST_ARRAY_TEXCOORD0,
91 DPSOFTRAST_ARRAY_TEXCOORD1,
92 DPSOFTRAST_ARRAY_TEXCOORD2,
93 DPSOFTRAST_ARRAY_TEXCOORD3,
94 DPSOFTRAST_ARRAY_TEXCOORD4,
95 DPSOFTRAST_ARRAY_TEXCOORD5,
96 DPSOFTRAST_ARRAY_TEXCOORD6,
97 DPSOFTRAST_ARRAY_TEXCOORD7,
98 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table (dpsoftrast.texture).
102 typedef struct DPSOFTRAST_Texture_s
// filter mode last stored by DPSOFTRAST_Texture_Filter
109 DPSOFTRAST_TEXTURE_FILTER filter;
// counter declared with the ATOMIC_COUNTER type
// NOTE(review): presumably counts active bindings of this texture - its users are outside this section
112 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL pointer marks the slot as unused
113 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
114 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands in the command pool are sized/aligned in units of COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND).
118 #define COMMAND_SIZE ALIGN_SIZE
119 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command: the opcode and the size recorded for the command.
121 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
123 unsigned char opcode;
124 unsigned short commandsize;
// Opcode 0 is reserved; DPSOFTRAST_AllocateCommand writes it when a command does not fit
// in the space left at the end of the pool buffer.
128 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> and the
// aligned struct type DPSOFTRAST_Command_<name>, which starts with the same opcode/commandsize header.
130 #define DEFCOMMAND(opcodeval, name, fields) \
131 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
132 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
134 unsigned char opcode; \
135 unsigned short commandsize; \
137 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer below (2097152 = 2 MiB).
139 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command - its users are outside this section.
140 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command buffer shared between the submitting code and the draw threads. The allocator code
// also accesses members named freecommand and usedcommands on this struct.
142 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
146 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
148 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed while shading spans.
150 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
152 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// Interpolation planes per attribute array, as used by the CALCATTRIB macros:
// [array][0] = change per unit of x, [array][1] = change per unit of y, [array][2] = base value.
154 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
156 DPSOFTRAST_State_Triangle);
// DPSOFTRAST_CALCATTRIB (SSE) and DPSOFTRAST_CALCATTRIB4F (scalar) evaluate one attribute at the
// start of a span: slope receives attribs[arrayindex][0], and
//   data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// for all four components.
// NOTE(review): the 4F variant writes span->x / span->y without parenthesizing the macro
// argument, unlike the SSE variant - only safe when span is a plain identifier.
158 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
159 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
160 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
161 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
162 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
164 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
165 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
166 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
167 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
168 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
169 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
170 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
171 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
172 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
// NOTE(review): presumably the pixel count of one perspective-correct sub-span - its users are outside this section.
175 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a triangle, queued in DPSOFTRAST_State_Thread::spans.
177 typedef ALIGN(struct DPSOFTRAST_State_Span_s
179 int triangle; // triangle this span was generated by
180 int x; // framebuffer x coord
181 int y; // framebuffer y coord
182 int startx; // usable range (according to pixelmask)
183 int endx; // usable range (according to pixelmask)
184 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
186 DPSOFTRAST_State_Span);
// Capacities of the per-thread span and triangle arrays in DPSOFTRAST_State_Thread.
188 #define DPSOFTRAST_DRAW_MAXSPANS 1024
189 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in DPSOFTRAST_State_Thread::validate; each names a group of derived values
// that DPSOFTRAST_Validate recomputes (RecalcFB / RecalcDepthFunc / RecalcBlendFunc).
191 #define DPSOFTRAST_VALIDATE_FB 1
192 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
193 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// Union of all three dirty bits.
194 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc picks one of these from the
// GL source/destination factor pair (and the subtract flag) and stores it in fb_blendmode.
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
196 typedef enum DPSOFTRAST_BLENDMODE_e
198 DPSOFTRAST_BLENDMODE_OPAQUE,
199 DPSOFTRAST_BLENDMODE_ALPHA,
200 DPSOFTRAST_BLENDMODE_ADDALPHA,
201 DPSOFTRAST_BLENDMODE_ADD,
202 DPSOFTRAST_BLENDMODE_INVMOD,
203 DPSOFTRAST_BLENDMODE_MUL,
204 DPSOFTRAST_BLENDMODE_MUL2,
205 DPSOFTRAST_BLENDMODE_SUBALPHA,
206 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
207 DPSOFTRAST_BLENDMODE_INVADD,
208 DPSOFTRAST_BLENDMODE_TOTAL
210 DPSOFTRAST_BLENDMODE;
// State owned by one draw thread: its copy of the render state (updated by the
// DPSOFTRAST_Interpret_* functions), values derived from it, its position in the command
// pool, and its span/triangle work arrays.
212 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// [0] = alongnormal, [1] = intoview (see DPSOFTRAST_Interpret_PolygonOffset)
231 float polygonoffset[2];
234 int shader_permutation;
// pointers into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow when that array moves
236 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
// four floats per uniform
238 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
239 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
241 // DPSOFTRAST_VALIDATE_ flags
244 // derived values (DPSOFTRAST_VALIDATE_FB)
// filled in by DPSOFTRAST_RecalcViewport
247 ALIGN(float fb_viewportcenter[4]);
248 ALIGN(float fb_viewportscale[4]);
250 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
253 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; compared against freecommand/drawcommand by
// DPSOFTRAST_Draw_FreeCommandPool to see how far this thread has progressed
262 ATOMIC(volatile int commandoffset);
// waiting: set while DPSOFTRAST_Draw_FreeCommandPool blocks on this thread's waitcond
264 volatile bool waiting;
// starving: when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before waiting
265 volatile bool starving;
// held around the waiting/starving handshake
268 SDL_mutex *drawmutex;
272 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
273 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
275 DPSOFTRAST_State_Thread);
// Global rasterizer state on the submitting side: render targets, current vertex array
// pointers, the texture table, the draw threads and the command pool.
277 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets installed by DPSOFTRAST_SetRenderTargets
281 unsigned int *fb_depthpixels;
282 unsigned int *fb_colorpixels[4];
// kept in sync with the viewport by DPSOFTRAST_Viewport
285 ALIGN(float fb_viewportcenter[4]);
286 ALIGN(float fb_viewportscale[4]);
289 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
290 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-supplied vertex arrays
292 const float *pointer_vertex3f;
293 const float *pointer_color4f;
294 const unsigned char *pointer_color4ub;
295 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
298 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
299 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the texture array below; rebased by DPSOFTRAST_Texture_Grow
300 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
304 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
305 float *screencoord4f;
311 int shader_permutation;
// lowest slot index at which DPSOFTRAST_Texture_New starts searching for a free slot
315 int texture_firstfree;
// growable texture table (see DPSOFTRAST_Texture_Grow)
316 DPSOFTRAST_Texture *texture;
// static message describing the most recent API error
321 const char *errorstring;
// array of numthreads draw-thread states
325 DPSOFTRAST_State_Thread *threads;
// pool offset published by DPSOFTRAST_Draw_SyncCommands (copy of commandpool.freecommand)
327 ATOMIC(volatile int drawcommand);
329 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
333 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating point depth to the integer depth buffer format
// (1024 * 1048576 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one BGRA8 pixel: a in bits 24-31, r in 16-23,
// g in 8-15, b in 0-7, each rounded to the nearest of 0..255.
// Every argument is parenthesized so expression arguments such as (x + y) convert correctly.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth format; the mapping is inverted, so d == 1 gives 0
// and d == 0 gives DPSOFTRAST_DEPTHSCALE. The argument is parenthesized for the same reason as above.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Derives the viewport transform from a GL-style viewport.
//   viewport          - {x, y, width, height}
//   fb_viewportcenter - out: 4 floats, offset added after scaling
//   fb_viewportscale  - out: 4 floats, per-component scale
// Slot 1 handles x, slot 2 handles y (flipped against dpsoftrast.fb_height, hence the negative
// scale), slot 3 maps with scale 0.5 and offset 0.5, and slot 0 is passed through (scale 1, offset 0).
// NOTE(review): slots 0 and 3 presumably correspond to w and z - confirm at the use site.
341 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
343 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
344 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
345 fb_viewportcenter[3] = 0.5f;
346 fb_viewportcenter[0] = 0.0f;
347 fb_viewportscale[1] = 0.5f * viewport[2];
348 fb_viewportscale[2] = -0.5f * viewport[3];
349 fb_viewportscale[3] = 0.5f;
350 fb_viewportscale[0] = 1.0f;
// Recomputes the values derived under DPSOFTRAST_VALIDATE_FB for one thread:
// fb_scissor {x, y, width, height} and the viewport center/scale.
353 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
355 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
356 // and viewport projection values
// scissor rectangle as corner pairs; the y range is flipped against the framebuffer height
359 x1 = thread->scissor[0];
360 x2 = thread->scissor[0] + thread->scissor[2];
361 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
362 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the rectangle is the whole framebuffer
363 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the upper bounds to the framebuffer size
365 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
367 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// store as origin + extent
368 thread->fb_scissor[0] = x1;
369 thread->fb_scissor[1] = y1;
370 thread->fb_scissor[2] = x2 - x1;
371 thread->fb_scissor[3] = y2 - y1;
373 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
// Derives the effective depth comparison: the configured depthfunc while depth testing is
// enabled, GL_ALWAYS otherwise.
376 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
378 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the thread's GL blend factors (blendfunc[0] = source, blendfunc[1] = destination) and
// its blendsubtract flag onto one DPSOFTRAST_BLENDMODE value in fb_blendmode.
// Factor pairs without a dedicated mode fall back to DPSOFTRAST_BLENDMODE_OPAQUE.
381 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_ALPHA, GL_ONE) is recognized
383 if (thread->blendsubtract)
// both factors are packed into one switch key: source in the high 16 bits, destination in the low bits
385 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC emits one case label for a factor pair and selects the given blend mode
387 #define BLENDFUNC(sfactor, dfactor, blendmode) \
388 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
389 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
390 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending: table of recognized factor pairs
395 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
397 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
398 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
399 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
400 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
401 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two equivalent ways of requesting a plain multiply
402 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
403 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
404 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
405 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
406 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
407 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Inline pre-check: only calls DPSOFTRAST_Validate when one of the requested bits is dirty.
// Always evaluates to 0.
// NOTE(review): the macro argument 'thread' is used unparenthesized in thread->validate.
412 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Brings the derived state selected by mask up to date.
//   thread - draw thread whose derived values are refreshed
//   mask   - combination of DPSOFTRAST_VALIDATE_* bits the caller needs to be current
// Only bits that are both requested and dirty are processed; each is cleared from
// thread->validate before the corresponding Recalc function runs.
414 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
416 mask &= thread->validate;
419 if (mask & DPSOFTRAST_VALIDATE_FB)
421 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
422 DPSOFTRAST_RecalcFB(thread);
424 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
426 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
427 DPSOFTRAST_RecalcDepthFunc(thread);
429 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
431 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
432 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by handle. A handle is valid when 1 <= index < texture_end and the
// slot has pixel storage; in that case a pointer to the slot is returned.
// NOTE(review): callers test the result with !texture, so the failure path presumably returns NULL.
436 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
438 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
439 return &dpsoftrast.texture[index];
// Enlarges the texture table and rebases every pointer into it (global and per-thread
// texbound entries), since realloc may move the array.
443 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be converted to indices below
445 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
446 DPSOFTRAST_State_Thread *thread;
450 // expand texture array as needed
451 if (dpsoftrast.texture_max < 1024)
452 dpsoftrast.texture_max = 1024;
454 dpsoftrast.texture_max *= 2;
// NOTE(review): the result is stored straight into dpsoftrast.texture and not checked here;
// if realloc fails the old array is leaked and the pointer becomes NULL.
455 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global texture bindings: same element index, new base address
456 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457 if (dpsoftrast.texbound[i])
458 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase the bindings cached by every draw thread
459 for (j = 0; j < dpsoftrast.numthreads; j++)
461 thread = &dpsoftrast.threads[j];
462 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
463 if (thread->texbound[i])
464 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture and allocates zero-filled storage for all of its mip levels.
//   flags  - DPSOFTRAST_TEXTURE_FLAG_* bits combined with a DPSOFTRAST_TEXTURE_FORMAT_* value
//   width, height, depth - dimensions in texels; each must be a power of two and at most
//                          DPSOFTRAST_TEXTURE_MAXSIZE
// Invalid parameter combinations set dpsoftrast.errorstring.
// NOTE(review): the return statements are not visible in this section; presumably the slot
// index on success and 0 on failure - confirm.
468 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces, everything else one
477 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
478 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
479 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is computed in int before the MAXSIZE check below, so very large
// dimensions could overflow here.
480 if (width*height*depth < 1)
482 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
485 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
487 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
492 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
493 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
494 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
// depth textures: no cubemaps, no mipmaps
496 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
497 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
499 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
504 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
507 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// 3D textures (depth != 1) can be neither cubemaps nor mipmapped
514 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
516 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
519 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
521 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the condition and message of the check directly above.
524 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
529 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero for any value that is not a power of two
534 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
536 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
539 // find first empty slot in texture array
540 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
541 if (!dpsoftrast.texture[texnum].bytes)
// claim slot texnum: advance the search start, grow the table and the used range if needed
543 dpsoftrast.texture_firstfree = texnum + 1;
544 if (dpsoftrast.texture_max <= texnum)
545 DPSOFTRAST_Texture_Grow();
546 if (dpsoftrast.texture_end <= texnum)
547 dpsoftrast.texture_end = texnum + 1;
548 texture = &dpsoftrast.texture[texnum];
549 memset(texture, 0, sizeof(*texture));
550 texture->flags = flags;
551 texture->width = width;
552 texture->height = height;
553 texture->depth = depth;
554 texture->sides = sides;
// lay out the mip chain: each level takes w*h*d*sides texels of 4 bytes and starts at the
// running byte offset 'size'
566 s = w * h * d * sides * 4;
567 texture->mipmap[mipmaps][0] = size;
568 texture->mipmap[mipmaps][1] = s;
569 texture->mipmap[mipmaps][2] = w;
570 texture->mipmap[mipmaps][3] = h;
571 texture->mipmap[mipmaps][4] = d;
// the chain ends at a single texel, or after level 0 when mipmaps were not requested
574 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 texture->mipmaps = mipmaps;
581 texture->size = size;
583 // allocate the pixels now
// NOTE(review): no check of the allocation result is visible here; a NULL bytes pointer would
// also make the slot look unused.
584 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the texture with the given handle; invalid handles are ignored.
588 void DPSOFTRAST_Texture_Free(int index)
590 DPSOFTRAST_Texture *texture;
591 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// free the pixel storage and wipe the slot; bytes == NULL marks it as unused
595 MM_FREE(texture->bytes);
596 texture->bytes = NULL;
597 memset(texture, 0, sizeof(*texture));
598 // adjust the free range and used range
599 if (dpsoftrast.texture_firstfree > index)
600 dpsoftrast.texture_firstfree = index;
// drop trailing unused slots from the used range
601 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
602 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture by box-filtering the next larger level.
// Texels are 4 bytes; each channel is averaged with rounding. Does nothing for invalid
// handles or textures with a single level.
604 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
606 int i, x, y, z, w, layer0, layer1, row0, row1;
// o = output row in level i; i0..i3 = the (up to) four parent rows in level i-1
607 unsigned char *o, *i0, *i1, *i2, *i3;
608 DPSOFTRAST_Texture *texture;
609 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
610 if (texture->mipmaps <= 1)
612 for (i = 1;i < texture->mipmaps;i++)
614 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer when the parent level has no further layer
618 if (layer1 >= texture->mipmap[i-1][4])
619 layer1 = texture->mipmap[i-1][4]-1;
620 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row when the parent level has no further row
624 if (row1 >= texture->mipmap[i-1][3])
625 row1 = texture->mipmap[i-1][3]-1;
// row start = level offset + 4 bytes * ((height * layer + row) * width)
626 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
627 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
628 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
629 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
630 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
631 w = texture->mipmap[i][2];
634 if (texture->mipmap[i-1][2] > 1)
636 // average 3D texture
// eight parent texels per output texel: two neighbours from each of the four rows
637 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
639 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
640 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
641 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
642 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
647 // average 3D mipmap with parent width == 1
// one texel from each of the four rows; i2/i3 are not advanced by this loop header
648 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
650 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
651 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
652 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
653 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
659 if (texture->mipmap[i-1][2] > 1)
661 // average 2D texture (common case)
// 2x2 box filter over rows i0 and i1
662 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
664 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
665 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
666 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
667 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
672 // 2D texture with parent width == 1
// only two parent texels remain, one per row
673 o[0] = (i0[0] + i1[0] + 1) >> 1;
674 o[1] = (i0[1] + i1[1] + 1) >> 1;
675 o[2] = (i0[2] + i1[2] + 1) >> 1;
676 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a rectangular block of 4-byte texels into a texture and rebuilds its mipmaps.
//   index  - texture handle (invalid handles are ignored)
//   pixels - tightly packed source rows, blockwidth*4 bytes each
//   blockx, blocky, blockwidth, blockheight - destination rectangle in texels
// NOTE(review): the destination address and row pitch below always use level 0
// (mipmap[0][2]); the 'mip' parameter does not appear in these lines - confirm it is handled.
683 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
685 DPSOFTRAST_Texture *texture;
687 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
690 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
691 while (blockheight > 0)
// copy one row, then step the source by the block width and the destination by the texture width
693 memcpy(dst, pixels, blockwidth * 4);
694 pixels += blockwidth * 4;
695 dst += texture->mipmap[0][2] * 4;
698 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with the caller's pixels and rebuilds the mipmaps.
// pixels must provide at least mipmap[0][1] bytes (the byte size of level 0).
// Invalid handles are ignored.
700 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
702 DPSOFTRAST_Texture *texture;
703 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
706 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
707 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width in texels of the given mip level, or 0 for an invalid handle.
// NOTE(review): mip is not range-checked in these lines.
709 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
711 DPSOFTRAST_Texture *texture;
712 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713 return texture->mipmap[mip][2];
// Returns the height in texels of the given mip level, or 0 for an invalid handle.
// NOTE(review): mip is not range-checked in these lines.
715 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
717 DPSOFTRAST_Texture *texture;
718 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719 return texture->mipmap[mip][3];
// Returns the depth in texels of the given mip level, or 0 for an invalid handle.
// NOTE(review): mip is not range-checked in these lines.
721 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
723 DPSOFTRAST_Texture *texture;
724 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of the given mip level inside the texture's storage,
// or a null pointer for an invalid handle.
// NOTE(review): mip is not range-checked in these lines.
727 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
729 DPSOFTRAST_Texture *texture;
730 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
733 return texture->bytes + texture->mipmap[mip][0];
// Sets the sampling filter of a texture; invalid handles are ignored.
// Filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR need mipmaps: requesting one on a
// texture created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP records an error string.
735 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
737 DPSOFTRAST_Texture *texture;
738 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
739 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
741 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
746 texture->filter = filter;
// Installs the framebuffer the rasterizer draws into.
//   width, height   - framebuffer dimensions in pixels
//   depthpixels     - depth buffer, one unsigned int per pixel
//   colorpixels0..3 - up to four color buffers, one unsigned int per pixel
// The global state is only rewritten when at least one of the values actually changed.
749 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
751 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
752 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
753 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
755 dpsoftrast.fb_width = width;
756 dpsoftrast.fb_height = height;
757 dpsoftrast.fb_depthpixels = depthpixels;
758 dpsoftrast.fb_colorpixels[0] = colorpixels0;
759 dpsoftrast.fb_colorpixels[1] = colorpixels1;
760 dpsoftrast.fb_colorpixels[2] = colorpixels2;
761 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// Forward declaration; DPSOFTRAST_Draw_FreeCommandPool calls this before its definition appears.
764 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current end of the command pool by copying commandpool.freecommand into
// dpsoftrast.drawcommand.
766 static void DPSOFTRAST_Draw_SyncCommands(void)
769 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes sure at least 'space' bytes of the command pool are available.
// The pool's usedcommands count is recomputed from the progress of the draw threads; if that
// is still not enough, the function waits on the thread that lags furthest behind.
772 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
775 DPSOFTRAST_State_Thread *thread;
777 int freecommand = dpsoftrast.commandpool.freecommand;
778 int usedcommands = dpsoftrast.commandpool.usedcommands;
// compares current usage against the free space needed
779 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
781 DPSOFTRAST_Draw_SyncCommands();
// for every thread: bytes between its read position and the write position, modulo pool size;
// the largest such distance is the amount of the pool still in use
787 for (i = 0; i < dpsoftrast.numthreads; i++)
789 thread = &dpsoftrast.threads[i];
790 commandoffset = freecommand - thread->commandoffset;
791 if (commandoffset < 0)
792 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
793 if (commandoffset > usedcommands)
796 usedcommands = commandoffset;
// tests whether enough space is free now, or whether no thread was selected to wait on
799 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// block on the selected thread until it signals waitcond; the check under the mutex skips
// the wait when that thread has already caught up with drawcommand
801 thread = &dpsoftrast.threads[waitindex];
802 SDL_LockMutex(thread->drawmutex);
803 if (thread->commandoffset != dpsoftrast.drawcommand)
805 thread->waiting = true;
// wake the thread first if it is idle waiting for work
806 if (thread->starving) SDL_CondSignal(thread->drawcond);
807 SDL_CondWait(thread->waitcond, thread->drawmutex);
808 thread->waiting = false;
810 SDL_UnlockMutex(thread->drawmutex);
812 dpsoftrast.commandpool.usedcommands = usedcommands;
// NOTE(review): presumably the non-threaded alternative to the logic above - confirm against
// the surrounding conditionals.
814 DPSOFTRAST_Draw_FlushThreads();
// Rounds size up to the next multiple of COMMAND_SIZE (COMMAND_SIZE must be a power of two
// for the mask arithmetic to work).
818 #define DPSOFTRAST_ALIGNCOMMAND(size) \
819 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with its size rounded up by
// DPSOFTRAST_ALIGNCOMMAND, and returns it cast to that type.
820 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
821 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves 'size' bytes in the command pool and writes the command header into them.
//   opcode - DPSOFTRAST_OPCODE_* value stored in the header
//   size   - total size of the command in bytes, stored in the header's commandsize
// NOTE(review): the return statement is not visible here; DPSOFTRAST_ALLOCATECOMMAND casts
// the result to the command type, so it presumably returns the reserved command.
823 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
825 DPSOFTRAST_Command *command;
826 int freecommand = dpsoftrast.commandpool.freecommand;
827 int usedcommands = dpsoftrast.commandpool.usedcommands;
// space demanded on top of the command itself: one header, plus the unusable tail of the
// buffer when the command does not fit before the end
828 int extra = sizeof(DPSOFTRAST_Command);
829 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
830 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough room: have space reclaimed, then reload the pool counters
831 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
833 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
834 freecommand = dpsoftrast.commandpool.freecommand;
835 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit in the tail: mark the tail with a Reset opcode and count it as used
837 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
839 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
840 command->opcode = DPSOFTRAST_OPCODE_Reset;
841 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
844 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
845 command->opcode = opcode;
846 command->commandsize = size;
// wrap-around check on the write position once it reaches the end of the buffer
848 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// store the updated counters back into the pool
850 dpsoftrast.commandpool.freecommand = freecommand;
851 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated command of 'size' bytes: usedcommands is reduced by
// size and the adjusted write position is stored back into the pool.
// NOTE(review): only the wrap-around correction of freecommand (+= pool size) is visible
// here; presumably it follows a subtraction of size that went negative - confirm.
855 static void DPSOFTRAST_UndoCommand(int size)
857 int freecommand = dpsoftrast.commandpool.freecommand;
858 int usedcommands = dpsoftrast.commandpool.usedcommands;
861 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
862 usedcommands -= size;
863 dpsoftrast.commandpool.freecommand = freecommand;
864 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: set the viewport rectangle.
867 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread side: stores the rectangle as {x, y, width, height} and marks the framebuffer-derived
// values dirty so they are recomputed on the next validation.
868 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
870 thread->viewport[0] = command->x;
871 thread->viewport[1] = command->y;
872 thread->viewport[2] = command->width;
873 thread->viewport[3] = command->height;
874 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command for the draw threads and also updates the global copy of the
// viewport and its derived center/scale immediately.
876 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
878 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
881 command->width = width;
882 command->height = height;
884 dpsoftrast.viewport[0] = x;
885 dpsoftrast.viewport[1] = y;
886 dpsoftrast.viewport[2] = width;
887 dpsoftrast.viewport[3] = height;
888 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to a constant color.
891 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: works on the scissor rectangle, restricted to the rows owned by this thread
// (the bands miny1..maxy1 and miny2..maxy2), for every color buffer that is present.
892 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
894 int i, x1, y1, x2, y2, w, h, x, y;
895 int miny1 = thread->miny1;
896 int maxy1 = thread->maxy1;
897 int miny2 = thread->miny2;
898 int maxy2 = thread->maxy2;
// make sure fb_scissor is current before reading it
902 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
903 x1 = thread->fb_scissor[0];
904 y1 = thread->fb_scissor[1];
905 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
906 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to this thread's rows
907 if (y1 < miny1) y1 = miny1;
908 if (y2 > maxy2) y2 = maxy2;
913 // FIXME: honor fb_colormask?
// clear color converted once to the packed BGRA8 pixel format
914 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
915 for (i = 0;i < 4;i++)
// tests for an absent color buffer
917 if (!dpsoftrast.fb_colorpixels[i])
// walk the first band (up to maxy1), then continue from miny2 for the second band
919 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in color buffer i
922 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
923 for (x = x1;x < x2;x++)
// API side: allocates a ClearColor command in the command pool.
928 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
930 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to a constant depth.
937 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: same traversal as the color clear - scissor rectangle restricted to the rows
// owned by this thread - applied to dpsoftrast.fb_depthpixels.
938 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
940 int x1, y1, x2, y2, w, h, x, y;
941 int miny1 = thread->miny1;
942 int maxy1 = thread->maxy1;
943 int miny2 = thread->miny2;
944 int maxy2 = thread->maxy2;
// make sure fb_scissor is current before reading it
948 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
949 x1 = thread->fb_scissor[0];
950 y1 = thread->fb_scissor[1];
951 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
952 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to this thread's rows
953 if (y1 < miny1) y1 = miny1;
954 if (y2 > maxy2) y2 = maxy2;
// clear value converted once to the integer depth format
959 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// walk the first band (up to maxy1), then continue from miny2 for the second band
960 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in the depth buffer
963 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
964 for (x = x1;x < x2;x++)
// API side: allocates a ClearDepth command in the command pool.
968 void DPSOFTRAST_ClearDepth(float d)
970 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: enable/disable writes per color channel.
974 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread side: normalizes each flag to 0/1 and builds fb_colormask, a BGRA8 pixel mask.
// Negating a 0/1 flag gives 0 or all-ones, which is then ANDed with the channel's byte:
// r -> 0x00FF0000, g -> 0x0000FF00, b -> 0x000000FF, a -> 0xFF000000.
975 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
977 thread->colormask[0] = command->r != 0;
978 thread->colormask[1] = command->g != 0;
979 thread->colormask[2] = command->b != 0;
980 thread->colormask[3] = command->a != 0;
981 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: allocates a ColorMask command in the command pool.
983 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
985 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: enable/disable depth testing.
992 DEFCOMMAND(5, DepthTest, int enable;)
// Thread side: stores the flag and marks the derived depth function dirty
// (fb_depthfunc depends on depthtest).
993 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
995 thread->depthtest = command->enable;
996 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API side: queues the command.
998 void DPSOFTRAST_DepthTest(int enable)
1000 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1001 command->enable = enable;
// Command 6: enable/disable the scissor test.
1004 DEFCOMMAND(6, ScissorTest, int enable;)
// Thread side: stores the flag and marks the framebuffer-derived values dirty
// (fb_scissor depends on scissortest).
1005 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1007 thread->scissortest = command->enable;
1008 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command.
1010 void DPSOFTRAST_ScissorTest(int enable)
1012 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1013 command->enable = enable;
// Command 7: set the scissor rectangle (carried as floats in the command).
1016 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Thread side: stores the rectangle as {x, y, width, height} and marks the
// framebuffer-derived values dirty.
1017 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1019 thread->scissor[0] = command->x;
1020 thread->scissor[1] = command->y;
1021 thread->scissor[2] = command->width;
1022 thread->scissor[3] = command->height;
1023 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command.
1025 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1027 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1030 command->width = width;
1031 command->height = height;
// Command 8: set the GL source/destination blend factors.
1034 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Thread side: stores the factor pair and marks the derived blend mode dirty.
1035 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1037 thread->blendfunc[0] = command->sfactor;
1038 thread->blendfunc[1] = command->dfactor;
1039 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// API side: queues the command.
1041 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1043 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1044 command->sfactor = sfactor;
1045 command->dfactor = dfactor;
// Command 9: enable/disable subtractive blending.
1048 DEFCOMMAND(9, BlendSubtract, int enable;)
// Thread side: stores the flag and marks the derived blend mode dirty.
1049 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1051 thread->blendsubtract = command->enable;
1052 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// API side: queues the command.
1054 void DPSOFTRAST_BlendSubtract(int enable)
1056 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1057 command->enable = enable;
// Command 10: enable/disable depth writes.
1060 DEFCOMMAND(10, DepthMask, int enable;)
// Thread side: stores the flag; nothing is derived from it, so no validate bit is raised.
1061 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1063 thread->depthmask = command->enable;
// API side: queues the command.
1065 void DPSOFTRAST_DepthMask(int enable)
1067 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1068 command->enable = enable;
// Command 11: set the depth comparison function.
1071 DEFCOMMAND(11, DepthFunc, int func;)
// Thread side: stores the comparison function.
// NOTE(review): fb_depthfunc is derived from depthfunc in DPSOFTRAST_RecalcDepthFunc, but
// unlike DPSOFTRAST_Interpret_DepthTest no DPSOFTRAST_VALIDATE_DEPTHFUNC bit is raised in
// the line shown here, so the derived value may stay stale until the depth test is toggled -
// confirm whether this is intended.
1072 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1074 thread->depthfunc = command->func;
// API side: queues the command.
1076 void DPSOFTRAST_DepthFunc(int func)
1078 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1079 command->func = func;
// Command 12: set the depth range.
1082 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Thread side: stores the range as depthrange[0] = near, depthrange[1] = far.
1083 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1085 thread->depthrange[0] = command->nearval;
1086 thread->depthrange[1] = command->farval;
// API side: queues the command.
1088 void DPSOFTRAST_DepthRange(float nearval, float farval)
1090 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1091 command->nearval = nearval;
1092 command->farval = farval;
// Command 13 (PolygonOffset), payload: float alongnormal, float intoview.
// The interpreter stores them as thread->polygonoffset[0] and [1].
1095 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1096 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1098 thread->polygonoffset[0] = command->alongnormal;
1099 thread->polygonoffset[1] = command->intoview;
// Public entry point: records both polygon offset terms in a PolygonOffset
// command record.
1101 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1103 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1104 command->alongnormal = alongnormal;
1105 command->intoview = intoview;
// Command 14 (CullFace), payload: int mode.
// The interpreter copies the mode into thread->cullface.
1108 DEFCOMMAND(14, CullFace, int mode;)
1109 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1111 thread->cullface = command->mode;
// Public entry point: records the face culling mode in a CullFace command
// record.
1113 void DPSOFTRAST_CullFace(int mode)
1115 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1116 command->mode = mode;
// Command 15 (AlphaTest), payload: int enable.
// The interpreter copies the flag into thread->alphatest.
1119 DEFCOMMAND(15, AlphaTest, int enable;)
1120 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1122 thread->alphatest = command->enable;
// Public entry point: records the alpha-test enable flag in an AlphaTest
// command record.
1124 void DPSOFTRAST_AlphaTest(int enable)
1126 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1127 command->enable = enable;
// Command 16 (AlphaFunc), payload: int func, float ref.
// The interpreter stores the comparison id in thread->alphafunc and the
// reference value in thread->alphavalue.
1130 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1131 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1133 thread->alphafunc = command->func;
1134 thread->alphavalue = command->ref;
// Public entry point: records the alpha comparison function in an AlphaFunc
// command record.
// NOTE(review): this listing skips file line 1140; presumably it assigns
// command->ref = ref (the interpreter reads command->ref) — confirm in the
// full file.
1136 void DPSOFTRAST_AlphaFunc(int func, float ref)
1138 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1139 command->func = func;
// Sets the constant RGBA color in dpsoftrast.color directly (no command record
// is created here). DPSOFTRAST_Array_Load replicates this color per vertex
// when no color array pointer is set.
1143 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1145 dpsoftrast.color[0] = r;
1146 dpsoftrast.color[1] = g;
1147 dpsoftrast.color[2] = b;
1148 dpsoftrast.color[3] = a;
// Reads back a rectangle of color buffer 0 into outpixels at 4 bytes per pixel.
// The requested block is clipped to the framebuffer size. Source rows are
// addressed as (fb_height - 1 - y), i.e. the framebuffer is read vertically
// flipped relative to the block's y coordinate. There is one row loop for the
// big-endian case and one for the default case.
// NOTE(review): this listing omits several body lines (declarations of
// bx1/by1/x/y/b/o and the per-pixel copy statements) — consult the full file.
// NOTE(review): outstride comes from the unclipped blockwidth while the output
// row index is (y - by1) with the clipped by1, and the output column is not
// offset for a clipped bx1; confirm callers never pass negative blockx/blocky.
1151 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1153 int outstride = blockwidth * 4;
1154 int instride = dpsoftrast.fb_width * 4;
1157 int bx2 = blockx + blockwidth;
1158 int by2 = blocky + blockheight;
1163 unsigned char *inpixels;
// clip the block to the framebuffer
1167 if (bx1 < 0) bx1 = 0;
1168 if (by1 < 0) by1 = 0;
1169 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1170 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1173 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1174 if (dpsoftrast.bigendian)
1176 for (y = by1;y < by2;y++)
// b: start of the (flipped) source row, o: start of the output row
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
1180 for (x = bx1;x < bx2;x++)
1193 for (y = by1;y < by2;y++)
1195 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of color buffer 0 (origin sx,sy) into mip level `mip` of
// texture `index` (origin tx,ty). Returns without doing anything when the
// texture handle or the mip level is invalid. Both rectangles are clipped to
// their surface, rows are copied with memcpy at 4 bytes per pixel, and the
// mipmaps are recalculated when the texture has more than one mip level.
// NOTE(review): this listing omits lines of the body (declarations such as
// tx1/ty1/sx1/sy1/tw/th/sw/sh, and whatever sits between the visible
// statements) — consult the full file before changing anything.
// NOTE(review): clipping tx1/ty1 to 0 does not shift sx1/sy1 (and vice versa),
// so a partially off-surface rectangle copies from a shifted origin; confirm
// this is acceptable for the callers.
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 int tx2 = tx + width;
1207 int ty2 = ty + height;
1210 int sx2 = sx + width;
1211 int sy2 = sy + height;
1221 unsigned int *spixels;
1222 unsigned int *tpixels;
1223 DPSOFTRAST_Texture *texture;
1224 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: color buffer 0
1228 spixels = dpsoftrast.fb_colorpixels[0];
1229 swidth = dpsoftrast.fb_width;
1230 sheight = dpsoftrast.fb_height;
// destination surface: texture->mipmap[mip] supplies the byte offset ([0])
// and the width/height ([2], [3]) of this mip level
1231 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1232 twidth = texture->mipmap[mip][2];
1233 theight = texture->mipmap[mip][3];
// clip the destination rectangle to the mip level
1234 if (tx1 < 0) tx1 = 0;
1235 if (ty1 < 0) ty1 = 0;
1236 if (tx2 > twidth) tx2 = twidth;
1237 if (ty2 > theight) ty2 = theight;
// clip the source rectangle to the framebuffer
1238 if (sx1 < 0) sx1 = 0;
1239 if (sy1 < 0) sy1 = 0;
1240 if (sx2 > swidth) sx2 = swidth;
1241 if (sy2 > sheight) sy2 = sheight;
// copy no more than the source rectangle provides
1246 if (tw > sw) tw = sw;
1247 if (th > sh) th = sh;
// NOTE(review): file line 1249 is missing from this listing; presumably an
// early return for an empty region — confirm.
1248 if (tw < 1 || th < 1)
1250 for (y = 0;y < th;y++)
1251 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1252 if (texture->mipmaps > 1)
1253 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture), payload: int unitnum, DPSOFTRAST_Texture *texture.
// The interpreter releases one bind reference on the texture previously bound
// to this unit in the thread state (if any), then stores the new texture
// pointer. The new pointer may be NULL, hence the NULL check before the
// decrement.
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1259 if (thread->texbound[command->unitnum])
1260 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261 thread->texbound[command->unitnum] = command->texture;
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1265 DPSOFTRAST_Command_SetTexture *command;
1266 DPSOFTRAST_Texture *texture;
1267 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1269 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1272 texture = DPSOFTRAST_Texture_GetByIndex(index);
1273 if (index && !texture)
1275 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1279 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280 command->unitnum = unitnum;
1281 command->texture = texture;
1283 dpsoftrast.texbound[unitnum] = texture;
1284 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Remembers the client position array (3 floats per vertex) and its stride in
// bytes; only the pointer is stored, the data is not copied here.
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1289 dpsoftrast.pointer_vertex3f = vertex3f;
1290 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array and clears the unsigned-byte color pointer
// so that only one of the two color sources is active. stride is in bytes.
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1294 dpsoftrast.pointer_color4f = color4f;
1295 dpsoftrast.pointer_color4ub = NULL;
1296 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array and clears the float color pointer
// so that only one of the two color sources is active. stride is in bytes.
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1300 dpsoftrast.pointer_color4f = NULL;
1301 dpsoftrast.pointer_color4ub = color4ub;
1302 dpsoftrast.stride_color = stride;
// Remembers the client texcoord array for unit `unitnum`: pointer, number of
// float components per vertex, and stride in bytes.
// NOTE(review): unitnum is used as an array index without a range check
// (DPSOFTRAST_SetTexture validates its unit number) — confirm callers always
// pass a valid unit.
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1306 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader), payload: int mode, int permutation.
// The interpreter copies both values into the thread state.
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1314 thread->shader_mode = command->mode;
1315 thread->shader_permutation = command->permutation;
// Public entry point: records the shader mode/permutation in a SetShader
// command record and also mirrors both values in the global dpsoftrast state.
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1319 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320 command->mode = mode;
1321 command->permutation = permutation;
// main-thread copy of the same state
1323 dpsoftrast.shader_mode = mode;
1324 dpsoftrast.shader_permutation = permutation;
// Command 19 (Uniform4f), payload: uniform index and 4 floats.
// The interpreter copies the 4 floats into thread->uniform4f starting at
// element index*4 (each uniform occupies one 4-float slot).
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1330 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: records a 4-float uniform in a Uniform4f command record
// and mirrors the same values in the global dpsoftrast.uniform4f table at
// slot index*4.
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1334 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335 command->index = index;
1336 command->val[0] = v0;
1337 command->val[1] = v1;
1338 command->val[2] = v2;
1339 command->val[3] = v3;
// main-thread copy of the same uniform
1341 dpsoftrast.uniform4f[index*4+0] = v0;
1342 dpsoftrast.uniform4f[index*4+1] = v1;
1343 dpsoftrast.uniform4f[index*4+2] = v2;
1344 dpsoftrast.uniform4f[index*4+3] = v3;
// Pointer variant of DPSOFTRAST_Uniform4f: v must point to at least 4 floats.
// It reuses the Uniform4f command type, and also mirrors the values in the
// global dpsoftrast.uniform4f table.
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1348 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349 command->index = index;
1350 memcpy(command->val, v, sizeof(command->val));
// main-thread copy of the same uniform
1352 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f), payload: uniform index and 16 floats declared
// through the ALIGN() macro.
// The interpreter copies all 16 floats into thread->uniform4f starting at
// element index*4, so a matrix occupies four consecutive 4-float slots.
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1358 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
// Each matrix gets its own UniformMatrix4f command record; the slot index
// advances by 4 and the source pointer by 16 floats per matrix. The (optionally
// transposed) rows are written both to the command record and to the global
// dpsoftrast.uniform4f table.
// NOTE(review): this listing omits the declarations of i/index and the
// condition that guards the transpose sequence (presumably the `transpose`
// argument) — confirm in the full file.
// NOTE(review): the results are written with aligned stores (_mm_store_ps), so
// command->val and &dpsoftrast.uniform4f[index*4] must be 16-byte aligned.
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1364 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1366 __m128 m0, m1, m2, m3;
1367 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368 command->index = index;
// choose unaligned or aligned loads depending on the source address
1369 if (((size_t)v)&(ALIGN_SIZE-1))
1371 m0 = _mm_loadu_ps(v);
1372 m1 = _mm_loadu_ps(v+4);
1373 m2 = _mm_loadu_ps(v+8);
1374 m3 = _mm_loadu_ps(v+12);
1378 m0 = _mm_load_ps(v);
1379 m1 = _mm_load_ps(v+4);
1380 m2 = _mm_load_ps(v+8);
1381 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the low/high halves
1385 __m128 t0, t1, t2, t3;
1386 t0 = _mm_unpacklo_ps(m0, m1);
1387 t1 = _mm_unpacklo_ps(m2, m3);
1388 t2 = _mm_unpackhi_ps(m0, m1);
1389 t3 = _mm_unpackhi_ps(m2, m3);
1390 m0 = _mm_movelh_ps(t0, t1);
1391 m1 = _mm_movehl_ps(t1, t0);
1392 m2 = _mm_movelh_ps(t2, t3);
1393 m3 = _mm_movehl_ps(t3, t2);
// store into the command record ...
1395 _mm_store_ps(command->val, m0);
1396 _mm_store_ps(command->val+4, m1);
1397 _mm_store_ps(command->val+8, m2);
1398 _mm_store_ps(command->val+12, m3);
// ... and into the main-thread uniform table
1399 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i), payload: uniform index and one int.
// The interpreter stores the value in thread->uniform1i[index].
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1410 thread->uniform1i[command->index] = command->val;
// Public entry point: records an integer uniform in a Uniform1i command record
// and mirrors the value in the global dpsoftrast.uniform1i table.
// NOTE(review): this listing skips file line 1416; presumably it assigns
// command->val = i0 (the interpreter reads command->val) — confirm.
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1414 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415 command->index = index;
// main-thread copy of the same uniform
1418 dpsoftrast.uniform1i[command->index] = i0;
// Copies 4-float items from a strided byte source into dst (4 floats each,
// `end` marks dst + size*4). Unaligned loads are used when either the source
// address or the stride is not a multiple of ALIGN_SIZE, aligned loads
// otherwise. dst is always written with aligned stores (_mm_store_ps), so it
// must be 16-byte aligned.
// NOTE(review): the loop headers and pointer increments are omitted from this
// listing.
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1424 float *end = dst + size*4;
1425 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1429 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1438 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float items (x, y, z) from a strided byte source into 4-float
// items (x, y, z, 1.0) in dst.
// The 1.0 is inserted by rotating the vector so the unused lane becomes lane
// 0, overwriting lane 0 with _mm_move_ss, and rotating back.
// When the source is tightly packed (stride == 12 bytes), four vertices are
// produced per iteration from three 16-byte loads (12 floats); end4 marks the
// end of the part of dst that is a multiple of four items. The code after that
// handles one vertex per 16-byte load, with unaligned or aligned loads
// depending on the source address and stride.
// dst is always written with aligned stores, so it must be 16-byte aligned.
// NOTE(review): loop headers, dst increments and braces are omitted from this
// listing.
// NOTE(review): the one-vertex path loads 4 floats per vertex although only 3
// belong to it, so it reads 4 bytes beyond the last vertex's data — confirm
// the source buffers tolerate this.
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1447 float *end = dst + size*4;
1448 if (stride == sizeof(float[3]))
1450 float *end4 = dst + (size&~3)*4;
1451 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// packed, unaligned source: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1455 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1456 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1462 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1469 src += 4*sizeof(float[3]);
// packed, aligned source: same shuffles with aligned loads
1476 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1490 src += 4*sizeof(float[3]);
// one vertex per load: unaligned variant first, aligned variant second
1494 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1498 __m128 v = _mm_loadu_ps((const float *)src);
1499 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502 _mm_store_ps(dst, v);
1511 __m128 v = _mm_load_ps((const float *)src);
1512 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515 _mm_store_ps(dst, v);
// Expands 2-float items (x, y) from a strided byte source into 4-float items
// (x, y, 0.0, 1.0) in dst; the constant z/w come from v2 = (0, 0, 0, 1).
// When the source is tightly packed (stride == 8 bytes), two items are produced
// per 16-byte load; end2 marks the end of the part of dst that is a multiple
// of two items. The final visible statement handles a single item by loading
// only the low two floats (_mm_loadl_pi) on top of v2.
// dst is always written with aligned stores, so it must be 16-byte aligned.
// NOTE(review): loop headers, dst increments and braces are omitted from this
// listing.
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1524 float *end = dst + size*4;
1525 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526 if (stride == sizeof(float[2]))
1528 float *end2 = dst + (size&~1)*4;
1529 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = x0 y0 x1 y1: the first store takes the low pair, the second the high pair
1533 __m128 v = _mm_loadu_ps((const float *)src);
1534 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1535 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1537 src += 2*sizeof(float[2]);
1544 __m128 v = _mm_load_ps((const float *)src);
1545 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1548 src += 2*sizeof(float[2]);
1554 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte items (unsigned 8-bit components) from a strided byte source
// into 4-float items scaled by 1/255 in dst.
// Bytes are widened to 32-bit integers by interleaving with zero twice
// (8 -> 16 -> 32 bits), converted to float, then multiplied by `scale`.
// When the source is tightly packed (stride == 4 bytes), four items are
// produced per 16-byte load; end4 marks the end of the part of dst that is a
// multiple of four items. The final visible statements convert one item at a
// time from a single 32-bit read.
// dst is always written with aligned stores, so it must be 16-byte aligned.
// NOTE(review): loop headers, dst increments and braces are omitted from this
// listing.
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1562 float *end = dst + size*4;
1563 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564 if (stride == sizeof(unsigned char[4]))
1566 float *end4 = dst + (size&~3)*4;
1567 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1571 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1577 src += 4*sizeof(unsigned char[4]);
1584 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1590 src += 4*sizeof(unsigned char[4]);
// single item: reads the 4 bytes through an int pointer
1596 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Loads one 4-float value from src (unaligned load) and writes it to dst with
// an aligned store; `end` marks dst + 4*size.
// NOTE(review): the loop around the store is omitted from this listing;
// presumably the value is replicated for all `size` items — confirm.
1603 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1605 float *end = dst + 4*size;
1606 __m128 v = _mm_loadu_ps(src);
1609 _mm_store_ps(dst, v);
// Transforms numitems 4-float vertices by a 4x4 matrix given as 16 floats:
// out = x*m0 + y*m1 + z*m2 + w*m3, where m0..m3 are the four consecutive
// 4-float groups of inmatrix16f.
// An exact identity matrix (bytewise comparison) degenerates into a plain
// copy, which is skipped entirely for in-place calls (out4f == in4f).
// The matrix is read with unaligned loads; the vertex loads are unaligned or
// aligned depending on the address of in4f.
// NOTE(review): the loop headers, the store that wraps the visible sum
// expressions and the pointer increments are omitted from this listing.
1615 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1618 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1619 __m128 m0, m1, m2, m3;
1621 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1623 // fast case for identity matrix
1624 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1627 end = out4f + numitems*4;
1628 m0 = _mm_loadu_ps(inmatrix16f);
1629 m1 = _mm_loadu_ps(inmatrix16f + 4);
1630 m2 = _mm_loadu_ps(inmatrix16f + 8);
1631 m3 = _mm_loadu_ps(inmatrix16f + 12);
1632 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: broadcast each component and accumulate component * m{0..3}
1636 __m128 v = _mm_loadu_ps(in4f);
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input: same computation with an aligned load
1650 __m128 v = _mm_load_ps(in4f);
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1654 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1655 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
/*
 * Copies numitems 4-float vertices from in4f to out4f.
 *
 * out4f:    destination, room for numitems * 4 floats
 * in4f:     source, numitems * 4 floats
 * numitems: number of vertices; zero or negative counts copy nothing
 *
 * An in-place call (out4f == in4f) is a no-op, matching the identity fast
 * path of DPSOFTRAST_Vertex_Transform. Partially overlapping ranges are not
 * supported.
 */
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// memcpy is undefined for overlapping ranges (including identical
	// pointers), and a negative count would turn into a huge size_t
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// Perspective-divides a clip-space vertex and applies the viewport transform.
// The input (x, y, z, w) is rotated to (w, x, y, z) and lane 0 is replaced by
// 1.0, giving (1, x, y, z). That vector is multiplied by viewportscale,
// divided by w and offset by viewportcenter, then rotated back. The result is
// (x', y', z', w') where w' = center[0] + scale[0] / w. viewportcenter and
// viewportscale are therefore consumed in (w, x, y, z) lane order.
// Note that `in` is evaluated once, but `viewportcenter`/`viewportscale` are
// used unparenthesized.
1669 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1671 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same visible body as DPSOFTRAST_PROJECTVERTEX: perspective divide followed
// by the viewport transform, using the same (w, x, y, z) lane order for
// viewportcenter/viewportscale.
// NOTE(review): no difference from DPSOFTRAST_PROJECTVERTEX is visible in this
// listing, and no use of this macro is visible in this chunk — check whether
// it is still needed.
1677 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1679 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1680 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1681 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1682 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a 4-component vector by a matrix given as four __m128 values:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
// NOTE(review): the line that declares `p` is omitted from this listing;
// presumably it is a local copy of `in` — confirm.
1685 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1688 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1691 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Estimates the vertical screen range covered by an axis-aligned bounding box
// (minpos/maxpos) after transformation by m0..m3, and reports it through
// *starty and *endy.
// The first two lanes of each matrix vector are swapped up front so that the
// transformed y lands in lane 0, where the scalar (_ss) min/max/divide
// operations work. For every box corner k, clipdist[k] = z + w of the
// transformed corner. Corners with clipdist >= 0 clear their bit in clipmask
// and contribute y/w directly (BBFRONT). For corners whose bit is still set,
// each box edge leading to an accepted neighbour (k^1, k^2, k^4) is
// intersected with the clipdist == 0 plane and the intersection point
// contributes instead (BBCLIP). The projected range is clamped to [-2, 2]
// before the viewport transform.
// NOTE(review): this listing omits the return statement, the
// `#define BBCLIP(k)` line, the BBFRONT(0, ...) / BBFRONT(7, ...) invocations
// and the local `proj` declaration inside BBFRONT. Presumably the function
// returns clipmask — confirm in the full file.
1694 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
// one bit per box corner; a set bit means "not accepted by the z + w >= 0 test"
1696 int clipmask = 0xFF;
1697 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix vector: transformed y ends up in lane 0
1698 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1699 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1700 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1701 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its clip distance, and if it is
// accepted, fold its projected lane-0 value into minproj/maxproj
1702 #define BBFRONT(k, pos) \
1704 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1705 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1706 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1709 clipmask &= ~(1<<k); \
1710 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711 minproj = _mm_min_ss(minproj, proj); \
1712 maxproj = _mm_max_ss(maxproj, proj); \
1716 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1717 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1718 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1719 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1720 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1721 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k) body: for a rejected corner, interpolate along each edge towards an
// accepted neighbour up to the point where the clip distance reaches zero
// (frac = d_k / (d_k - d_neighbour)) and project that point
1725 if (clipmask&(1<<k)) \
1727 if (!(clipmask&(1<<(k^1)))) \
1729 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1730 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1731 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732 minproj = _mm_min_ss(minproj, proj); \
1733 maxproj = _mm_max_ss(maxproj, proj); \
1735 if (!(clipmask&(1<<(k^2)))) \
1737 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1738 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1739 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740 minproj = _mm_min_ss(minproj, proj); \
1741 maxproj = _mm_max_ss(maxproj, proj); \
1743 if (!(clipmask&(1<<(k^4)))) \
1745 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1746 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1747 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748 minproj = _mm_min_ss(minproj, proj); \
1749 maxproj = _mm_max_ss(maxproj, proj); \
1753 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors into lane 0; with the (w, x, y, z) lane
// order that DPSOFTRAST_PROJECTVERTEX implies, that is the y term
1754 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1755 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to screen rows
1756 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1757 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1758 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1759 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// NOTE(review): starty comes from maxproj and endy from minproj, presumably
// because the y viewport scale is negative (flipped rows) — confirm.
1760 *starty = _mm_cvttss_si32(maxproj);
1761 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems vertices that need no matrix transform: each input vertex
// is stored unchanged in out4f and its projected form (see
// DPSOFTRAST_PROJECTVERTEX) in screen4f. The component-wise min/max of the
// input positions is tracked and handed to DPSOFTRAST_Vertex_BoundY together
// with an identity matrix; that function's result is returned and it fills in
// *starty / *endy.
// Input loads are unaligned or aligned depending on the address of in4f; the
// stores to out4f and screen4f are always aligned stores.
// NOTE(review): the loop headers and pointer increments are omitted from this
// listing.
// NOTE(review): minpos/maxpos are seeded by loading in4f before the loop, so
// in4f is read even when numitems is 0 — confirm callers never pass an empty
// array.
1766 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1769 float *end = out4f + numitems*4;
1770 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1771 __m128 minpos, maxpos;
1772 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input
1774 minpos = maxpos = _mm_loadu_ps(in4f);
1777 __m128 v = _mm_loadu_ps(in4f);
1778 minpos = _mm_min_ps(minpos, v);
1779 maxpos = _mm_max_ps(maxpos, v);
1780 _mm_store_ps(out4f, v);
1781 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1782 _mm_store_ps(screen4f, v);
// aligned input
1790 minpos = maxpos = _mm_load_ps(in4f);
1793 __m128 v = _mm_load_ps(in4f);
1794 minpos = _mm_min_ps(minpos, v);
1795 maxpos = _mm_max_ps(maxpos, v);
1796 _mm_store_ps(out4f, v);
1797 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1798 _mm_store_ps(screen4f, v);
// bound the box of the input positions using an identity matrix
1805 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1806 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1807 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1808 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1809 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems vertices by inmatrix16f and projects them: the
// transformed vertex goes to out4f and its projected form (see
// DPSOFTRAST_PROJECTVERTEX) to screen4f.
// An exact identity matrix (bytewise comparison) is delegated to
// DPSOFTRAST_Vertex_Project.
// The component-wise min/max is tracked over the *untransformed* input
// positions; DPSOFTRAST_Vertex_BoundY then applies the matrix to the corners
// of that box itself. Its result is returned and it fills in *starty / *endy.
// NOTE(review): the loop headers and pointer increments are omitted from this
// listing.
1814 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1817 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1818 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1820 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1821 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1822 end = out4f + numitems*4;
1823 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1824 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is read with unaligned loads
1825 m0 = _mm_loadu_ps(inmatrix16f);
1826 m1 = _mm_loadu_ps(inmatrix16f + 4);
1827 m2 = _mm_loadu_ps(inmatrix16f + 8);
1828 m3 = _mm_loadu_ps(inmatrix16f + 12);
1829 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input
1831 minpos = maxpos = _mm_loadu_ps(in4f);
1834 __m128 v = _mm_loadu_ps(in4f);
1835 minpos = _mm_min_ps(minpos, v);
1836 maxpos = _mm_max_ps(maxpos, v);
1837 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1838 _mm_store_ps(out4f, v);
1839 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1840 _mm_store_ps(screen4f, v);
// aligned input
1848 minpos = maxpos = _mm_load_ps(in4f);
1851 __m128 v = _mm_load_ps(in4f);
1852 minpos = _mm_min_ps(minpos, v);
1853 maxpos = _mm_max_ps(maxpos, v);
1854 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1855 _mm_store_ps(out4f, v);
1856 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1857 _mm_store_ps(screen4f, v);
1864 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Converts the client vertex array selected by `inarray` into
// 4-floats-per-vertex form in dpsoftrast.post_array4f[outarray], for
// dpsoftrast.numvertices vertices starting at dpsoftrast.firstvertex.
//  - POSITION: 3 floats expanded to 4 via DPSOFTRAST_Load3fTo4f
//  - COLOR: the float color array if set, else the unsigned-byte color array
//    if set, else the constant dpsoftrast.color replicated per vertex
//  - otherwise (texcoord arrays): dispatches to Load2fTo4f / Load3fTo4f /
//    Load4fTo4f based on components_texcoord, only when a pointer is set
// NOTE(review): the switch statement, case labels, breaks and the return
// statement are omitted from this listing; presumably outf is returned —
// confirm.
1869 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1871 float *outf = dpsoftrast.post_array4f[outarray];
1872 const unsigned char *inb;
1873 int firstvertex = dpsoftrast.firstvertex;
1874 int numvertices = dpsoftrast.numvertices;
1878 case DPSOFTRAST_ARRAY_POSITION:
1879 stride = dpsoftrast.stride_vertex;
// byte address of the first vertex to convert
1880 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1881 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1883 case DPSOFTRAST_ARRAY_COLOR:
1884 stride = dpsoftrast.stride_color;
1885 if (dpsoftrast.pointer_color4f)
1887 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1888 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1890 else if (dpsoftrast.pointer_color4ub)
1892 stride = dpsoftrast.stride_color;
1893 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1894 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array bound: use the constant color for every vertex
1898 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoord arrays are indexed relative to DPSOFTRAST_ARRAY_TEXCOORD0
1902 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1903 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1905 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1906 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1909 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1912 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1915 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms post_array4f[outarray] in place by inmatrix16f. When inarray >= 0
// the array is first (re)loaded from the client array via
// DPSOFTRAST_Array_Load; a negative inarray reuses the data already present.
// NOTE(review): the return statement is omitted from this listing; presumably
// `data` is returned — confirm.
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1926 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects post_array4f[outarray] into dpsoftrast.screencoord4f without a
// matrix transform. When inarray >= 0 the array is first loaded via
// DPSOFTRAST_Array_Load. The result of DPSOFTRAST_Vertex_Project is stored in
// dpsoftrast.drawclipped, and drawstarty/drawendy receive the row range.
// NOTE(review): the return statement is omitted from this listing; presumably
// `data` is returned — confirm.
1932 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1934 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1935 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms post_array4f[outarray] in place by inmatrix16f and projects it
// into dpsoftrast.screencoord4f. When inarray >= 0 the array is first loaded
// via DPSOFTRAST_Array_Load. The result of DPSOFTRAST_Vertex_TransformProject
// is stored in dpsoftrast.drawclipped, and drawstarty/drawendy receive the row
// range.
// NOTE(review): the return statement is omitted from this listing; presumably
// `data` is returned — confirm.
1940 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1942 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1943 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel perspective values for a span: 1/w is evaluated exactly
// at subspan boundaries (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels, or at the
// last pixel of the span) and stepped linearly by dz in between.
// w at the span origin is triangle->w[2] + span->x * w[0] + span->y * w[1],
// with w[0] acting as the per-pixel slope along x.
// NOTE(review): the declarations of x/z/dz, the line that seeds z at the start
// of each subspan, and the statement that writes into zf are omitted from this
// listing; presumably zf[x] receives z for every pixel — confirm.
1947 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1950 int startx = span->startx;
1951 int endx = span->endx;
1952 float wslope = triangle->w[0];
1953 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact 1/w at the first pixel of the span
1954 float endz = 1.0f / (w + wslope * startx);
1955 for (x = startx;x < endx;)
1957 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the final subspan ends on the last pixel of the span
1959 if (nextsub >= endx) nextsub = endsub = endx-1;
1960 endz = 1.0f / (w + wslope * nextsub);
// linear step between the two exact values (0 for a single-pixel subspan)
1961 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1962 for (; x <= endsub; x++, z += dz)
// Writes a span of shaded float pixels (in4f, 4 floats per pixel) into color
// buffer 0 using the blend mode in thread->fb_blendmode.
// in4f is read as R,G,B,A while the framebuffer bytes are written as B,G,R,A:
// in every mode except INVADD, in4f[+2] pairs with pixel[+0] and in4f[+0] with
// pixel[+2]. Results are clamped to 255 from above; only SUBALPHA also clamps
// at 0.
// When alpha testing is enabled, pixels whose alpha is below 0.5 are removed
// from span->pixelmask first.
// NOTE(review): this listing omits lines inside the function (declarations of
// x/d/a/b, the statements at the top of each loop body, `break`s, braces, and
// the assignment of `a` in the PSEUDOALPHA case) — consult the full file
// before changing anything.
// NOTE(review): negative input values are not clamped at 0 (except in
// SUBALPHA) before being narrowed to a byte — confirm the shaders never
// produce them.
1967 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1970 int startx = span->startx;
1971 int endx = span->endx;
1974 unsigned char * RESTRICT pixelmask = span->pixelmask;
1975 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// advance to the first pixel of this span (4 bytes per pixel)
1978 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1979 // handle alphatest now (this affects depth writes too)
// NOTE(review): the threshold is a fixed 0.5; thread->alphafunc and
// thread->alphavalue are not consulted in the visible lines.
1980 if (thread->alphatest)
1981 for (x = startx;x < endx;x++)
1982 if (in4f[x*4+3] < 0.5f)
1983 pixelmask[x] = false;
1984 // FIXME: this does not handle bigendian
1985 switch(thread->fb_blendmode)
// OPAQUE: dst = src
1987 case DPSOFTRAST_BLENDMODE_OPAQUE:
1988 for (x = startx;x < endx;x++)
1992 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1993 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1994 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1995 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1996 pixel[x*4+0] = d[0];
1997 pixel[x*4+1] = d[1];
1998 pixel[x*4+2] = d[2];
1999 pixel[x*4+3] = d[3];
// ALPHA: dst = src * alpha + dst * (1 - alpha)
2002 case DPSOFTRAST_BLENDMODE_ALPHA:
2003 for (x = startx;x < endx;x++)
2007 a = in4f[x*4+3] * 255.0f;
2008 b = 1.0f - in4f[x*4+3];
2009 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013 pixel[x*4+0] = d[0];
2014 pixel[x*4+1] = d[1];
2015 pixel[x*4+2] = d[2];
2016 pixel[x*4+3] = d[3];
// ADDALPHA: dst = src * alpha + dst
2019 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2020 for (x = startx;x < endx;x++)
2024 a = in4f[x*4+3] * 255.0f;
2025 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029 pixel[x*4+0] = d[0];
2030 pixel[x*4+1] = d[1];
2031 pixel[x*4+2] = d[2];
2032 pixel[x*4+3] = d[3];
// ADD: dst = src + dst
2035 case DPSOFTRAST_BLENDMODE_ADD:
2036 for (x = startx;x < endx;x++)
2040 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044 pixel[x*4+0] = d[0];
2045 pixel[x*4+1] = d[1];
2046 pixel[x*4+2] = d[2];
2047 pixel[x*4+3] = d[3];
// INVMOD: dst = dst * (1 - src)
2050 case DPSOFTRAST_BLENDMODE_INVMOD:
2051 for (x = startx;x < endx;x++)
2055 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059 pixel[x*4+0] = d[0];
2060 pixel[x*4+1] = d[1];
2061 pixel[x*4+2] = d[2];
2062 pixel[x*4+3] = d[3];
// MUL: dst = src * dst
2065 case DPSOFTRAST_BLENDMODE_MUL:
2066 for (x = startx;x < endx;x++)
2070 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2071 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2072 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2073 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2074 pixel[x*4+0] = d[0];
2075 pixel[x*4+1] = d[1];
2076 pixel[x*4+2] = d[2];
2077 pixel[x*4+3] = d[3];
// MUL2: dst = src * dst * 2
2080 case DPSOFTRAST_BLENDMODE_MUL2:
2081 for (x = startx;x < endx;x++)
2085 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2086 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2087 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2088 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2089 pixel[x*4+0] = d[0];
2090 pixel[x*4+1] = d[1];
2091 pixel[x*4+2] = d[2];
2092 pixel[x*4+3] = d[3];
// SUBALPHA: dst = dst - src * alpha, clamped to [0, 255]
2095 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2096 for (x = startx;x < endx;x++)
2100 a = in4f[x*4+3] * -255.0f;
2101 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2102 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2103 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2104 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2105 pixel[x*4+0] = d[0];
2106 pixel[x*4+1] = d[1];
2107 pixel[x*4+2] = d[2];
2108 pixel[x*4+3] = d[3];
// PSEUDOALPHA: dst = src * a + dst * (1 - alpha)
// NOTE(review): the assignment of `a` (file line 2116) is omitted from this
// listing; presumably a constant 255.0f — confirm.
2111 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2112 for (x = startx;x < endx;x++)
2117 b = 1.0f - in4f[x*4+3];
2118 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2119 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2120 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2121 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2122 pixel[x*4+0] = d[0];
2123 pixel[x*4+1] = d[1];
2124 pixel[x*4+2] = d[2];
2125 pixel[x*4+3] = d[3];
// INVADD: dst = (255 - dst) * src + dst
// NOTE(review): unlike every other mode, this case pairs in4f[+0] with
// pixel[+2] for the value written to byte 0 (and in4f[+2] with pixel[+0] for
// byte 2), which looks like a red/blue swap in the output — confirm and fix in
// the full file if unintended.
2128 case DPSOFTRAST_BLENDMODE_INVADD:
2129 for (x = startx;x < endx;x++)
2133 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2134 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2135 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2136 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2137 pixel[x*4+0] = d[0];
2138 pixel[x*4+1] = d[1];
2139 pixel[x*4+2] = d[2];
2140 pixel[x*4+3] = d[3];
// Blends one span of already-shaded BGRA8 pixels into the color framebuffer
// (dpsoftrast.fb_colorpixels[0]) according to thread->fb_blendmode.
//   thread   - supplies the alphatest flag and the blend mode
//   triangle - not referenced by the statements shown here
//   span     - supplies startx/endx, the framebuffer position (x, y) and the
//              per-pixel write mask (pixelmask)
//   in4ub    - source colors, 4 bytes per pixel, indexed by span-relative x
// Pixels whose pixelmask entry is zero are left untouched.
2146 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2150 int startx = span->startx;
2151 int endx = span->endx;
// 32-bit view of the source colors: one unsigned int per BGRA8 pixel
2152 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2153 unsigned char * RESTRICT pixelmask = span->pixelmask;
2154 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2155 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views to this span's origin so that they are indexed
// with the same x as ini[] and pixelmask[]
2158 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2159 pixeli += span->y * dpsoftrast.fb_width + span->x;
2160 // handle alphatest now (this affects depth writes too)
// the alpha channel here is a byte in 0..255, so the half-opacity cutoff is
// 128; comparing the byte against 0.5f only ever rejected alpha == 0
2161 if (thread->alphatest)
2162 for (x = startx;x < endx;x++)
2163 if (in4ub[x*4+3] < 128)
2164 pixelmask[x] = false;
2165 // FIXME: this does not handle bigendian
2166 switch(thread->fb_blendmode)
2168 case DPSOFTRAST_BLENDMODE_OPAQUE:
// fast path: when four consecutive mask bytes are all 1, four pixels are
// copied with a single unaligned 128-bit load/store
2169 for (x = startx;x + 4 <= endx;)
2171 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2173 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2187 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1) walks the span two pixels at a time, reading
// both mask bytes as one 16-bit value.  Depending on which of the two pixels
// are enabled it loads both pixels (64 bits) or a single pixel, widens the
// bytes to 16-bit lanes as src (new color) and dst (framebuffer color), and
// stores dst back packed with unsigned saturation (_mm_packus_epi16).  A
// second loop processes the remaining pixels one at a time.
// blend2 is the statement block for the two-pixel case and blend1 the one for
// the single-pixel case; each must compute the new dst from src and dst.
// NOTE(review): blend2/blend1 are presumably expanded between the loads and
// the store of each case - confirm against the full macro.
// ALPHA: dst + (src - dst) * srcalpha / 256
2188 #define FINISHBLEND(blend2, blend1) \
2189 for (x = startx;x + 1 < endx;x += 2) \
2192 switch (*(const unsigned short*)&pixelmask[x]) \
2195 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2196 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2198 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2201 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2202 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2204 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2207 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2208 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2210 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2215 for(;x < endx; x++) \
2218 if (!pixelmask[x]) \
2220 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2223 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2227 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2234 case DPSOFTRAST_BLENDMODE_ADDALPHA:
// dst + src * srcalpha / 256 ('blend' is the source alpha broadcast to every channel)
2236 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2243 case DPSOFTRAST_BLENDMODE_ADD:
// dst + src; values above 255 saturate when packed back to bytes
2244 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2246 case DPSOFTRAST_BLENDMODE_INVMOD:
// dst - dst * src / 256
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2250 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2253 case DPSOFTRAST_BLENDMODE_MUL:
// src * dst / 256
2254 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2256 case DPSOFTRAST_BLENDMODE_MUL2:
// src * dst / 128 (twice the MUL result); saturates when packed
2257 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2259 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// dst - src * srcalpha / 256; negative results saturate to 0 when packed
2261 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2268 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
// src + (dst - dst * srcalpha / 256)
2270 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2277 case DPSOFTRAST_BLENDMODE_INVADD:
// dst + (255 - dst) * src / 256
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2281 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to unit 'texunitindex' across one span and
// writes the result to out4f as four floats per pixel in R,G,B,A order
// (texel bytes are stored B,G,R,A, hence the 2,1,0,3 byte indices below).
//   thread       - supplies the bound texture (thread->texbound)
//   triangle     - supplies the mip level per texture unit and the attribute
//                  setup consumed by DPSOFTRAST_CALCATTRIB4F
//   span         - supplies startx/endx
//   out4f        - output, indexed by span-relative x
//   texunitindex - texture unit to sample
//   arrayindex   - vertex attribute that carries the texture coordinates
//   zf           - per-pixel factor multiplied into the interpolated
//                  coordinate (presumably the perspective-correction term)
2288 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2291 int startx = span->startx;
2292 int endx = span->endx;
2297 float tc[2], endtc[2];
2299 unsigned int tci[2];
2300 unsigned int tci1[2];
2301 unsigned int tcimin[2];
2302 unsigned int tcimax[2];
2307 const unsigned char * RESTRICT pixelbase;
2308 const unsigned char * RESTRICT pixel[4];
2309 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2310 // if no texture is bound, just fill it with white
2313 for (x = startx;x < endx;x++)
2315 out4f[x*4+0] = 1.0f;
2316 out4f[x*4+1] = 1.0f;
2317 out4f[x*4+2] = 1.0f;
2318 out4f[x*4+3] = 1.0f;
2322 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2323 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2324 // if this mipmap of the texture is 1 pixel, just fill it with that color
// the texel is read through pixelbase so that it comes from the selected mip
// level; reading texture->bytes directly would return the first texel of the
// texture data instead
2325 if (texture->mipmap[mip][1] == 4)
2327 c[0] = pixelbase[2] * (1.0f/255.0f);
2328 c[1] = pixelbase[1] * (1.0f/255.0f);
2329 c[2] = pixelbase[0] * (1.0f/255.0f);
2330 c[3] = pixelbase[3] * (1.0f/255.0f);
2331 for (x = startx;x < endx;x++)
2333 out4f[x*4+0] = c[0];
2334 out4f[x*4+1] = c[1];
2335 out4f[x*4+2] = c[2];
2336 out4f[x*4+3] = c[3];
2340 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2341 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2342 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the level's width and height in texels
2343 tcscale[0] = texture->mipmap[mip][2];
2344 tcscale[1] = texture->mipmap[mip][3];
2345 tciwidth = texture->mipmap[mip][2];
2348 tcimax[0] = texture->mipmap[mip][2]-1;
2349 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 used as a bit mask for wrapping
// NOTE(review): this only wraps correctly for power-of-two sizes - confirm
2350 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2351 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the start of the span
2352 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2353 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// The span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: the exact coordinate is evaluated at the sub-span end (endtc) and
// the pixels in between are stepped linearly in 16.16 fixed point
// (subtc += substep).
2354 for (x = startx;x < endx;)
2356 unsigned int subtc[2];
2357 unsigned int substep[2];
2358 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2359 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2360 if (nextsub >= endx)
// last sub-span: stop at the final pixel and rescale the step to its length
2362 nextsub = endsub = endx-1;
2363 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2367 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2368 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2369 substep[0] = (endtc[0] - tc[0]) * subscale;
2370 substep[1] = (endtc[1] - tc[1]) * subscale;
2371 subtc[0] = tc[0] * (1<<16);
2372 subtc[1] = tc[1] * (1<<16);
// NOTE(review): tci/tcimin/tcimax are unsigned, so a negative coordinate shows
// up as a large index and is clamped to tcimax rather than tcimin - confirm
// whether that is intended for clamp-to-edge.
2375 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// bilinear, clamp-to-edge
2377 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// subtc is 16.16 fixed point, so bits 4..15 are the top 12 bits of the
// fraction; masking the low 12 bits instead would make the blend weight
// cycle 16 times within every texel
2379 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2380 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four weights always sum to 0x1000000, so weight*byte sums are
// normalized with 1/(255*0x1000000) = 1/0xFF000000 below
2381 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2382 tci[0] = subtc[0]>>16;
2383 tci[1] = subtc[1]>>16;
2384 tci1[0] = tci[0] + 1;
2385 tci1[1] = tci[1] + 1;
2386 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2387 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2388 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2389 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2390 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2391 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2392 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2393 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2394 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2395 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2396 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2397 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2398 out4f[x*4+0] = c[0];
2399 out4f[x*4+1] = c[1];
2400 out4f[x*4+2] = c[2];
2401 out4f[x*4+3] = c[3];
// bilinear, wrapping (indices are masked with tciwrapmask)
2406 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// top 12 bits of the 16-bit fraction, as in the clamp-to-edge loop
2408 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2409 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2410 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2411 tci[0] = subtc[0]>>16;
2412 tci[1] = subtc[1]>>16;
2413 tci1[0] = tci[0] + 1;
2414 tci1[1] = tci[1] + 1;
2415 tci[0] &= tciwrapmask[0];
2416 tci[1] &= tciwrapmask[1];
2417 tci1[0] &= tciwrapmask[0];
2418 tci1[1] &= tciwrapmask[1];
2419 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2420 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2421 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2422 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2423 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2424 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2425 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2426 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2427 out4f[x*4+0] = c[0];
2428 out4f[x*4+1] = c[1];
2429 out4f[x*4+2] = c[2];
2430 out4f[x*4+3] = c[3];
// single-texel lookup, clamp-to-edge
2434 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2436 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2438 tci[0] = subtc[0]>>16;
2439 tci[1] = subtc[1]>>16;
2440 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2441 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2442 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2443 c[0] = pixel[0][2] * (1.0f / 255.0f);
2444 c[1] = pixel[0][1] * (1.0f / 255.0f);
2445 c[2] = pixel[0][0] * (1.0f / 255.0f);
2446 c[3] = pixel[0][3] * (1.0f / 255.0f);
2447 out4f[x*4+0] = c[0];
2448 out4f[x*4+1] = c[1];
2449 out4f[x*4+2] = c[2];
2450 out4f[x*4+3] = c[3];
// single-texel lookup, wrapping
2455 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2457 tci[0] = subtc[0]>>16;
2458 tci[1] = subtc[1]>>16;
2459 tci[0] &= tciwrapmask[0];
2460 tci[1] &= tciwrapmask[1];
2461 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2462 c[0] = pixel[0][2] * (1.0f / 255.0f);
2463 c[1] = pixel[0][1] * (1.0f / 255.0f);
2464 c[2] = pixel[0][0] * (1.0f / 255.0f);
2465 c[3] = pixel[0][3] * (1.0f / 255.0f);
2466 out4f[x*4+0] = c[0];
2467 out4f[x*4+1] = c[1];
2468 out4f[x*4+2] = c[2];
2469 out4f[x*4+3] = c[3];
2475 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2479 int startx = span->startx;
2480 int endx = span->endx;
2482 __m128 data, slope, tcscale;
2483 __m128i tcsize, tcmask, tcoffset, tcmax;
2485 __m128i subtc, substep, endsubtc;
2488 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2489 const unsigned char * RESTRICT pixelbase;
2490 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2491 // if no texture is bound, just fill it with white
2494 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2497 mip = triangle->mip[texunitindex];
2498 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2499 // if this mipmap of the texture is 1 pixel, just fill it with that color
2500 if (texture->mipmap[mip][1] == 4)
2502 unsigned int k = *((const unsigned int *)pixelbase);
2503 for (x = startx;x < endx;x++)
2507 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2508 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2509 flags = texture->flags;
2510 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2511 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2512 tcscale = _mm_cvtepi32_ps(tcsize);
2513 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2514 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2515 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2516 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2517 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2518 tcmax = _mm_packs_epi32(tcmask, tcmask);
2519 for (x = startx;x < endx;)
2521 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2522 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2523 if (nextsub >= endx)
2525 nextsub = endsub = endx-1;
2526 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2530 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2531 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2532 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2533 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2534 substep = _mm_slli_epi32(substep, 1);
2537 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2538 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2540 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2541 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2543 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2544 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2545 tci = _mm_madd_epi16(tci, tcoffset);
2546 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2547 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2548 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2549 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2550 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2551 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2552 fracm = _mm_srli_epi16(subtc, 1);
2553 pix1 = _mm_add_epi16(pix1,
2554 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2555 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2556 pix3 = _mm_add_epi16(pix3,
2557 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2558 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2559 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2560 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2561 pix2 = _mm_add_epi16(pix2,
2562 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2563 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2564 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2568 const unsigned char * RESTRICT ptr1;
2569 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2570 tci = _mm_madd_epi16(tci, tcoffset);
2571 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2572 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2573 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2574 fracm = _mm_srli_epi16(subtc, 1);
2575 pix1 = _mm_add_epi16(pix1,
2576 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2577 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2578 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2579 pix1 = _mm_add_epi16(pix1,
2580 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2581 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2582 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2586 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2588 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2590 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2591 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2592 tci = _mm_madd_epi16(tci, tcoffset);
2593 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2594 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2595 _mm_setzero_si128());
2596 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2597 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2598 _mm_setzero_si128());
2599 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2600 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2601 tci = _mm_madd_epi16(tci, tcoffset);
2602 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2603 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2604 _mm_setzero_si128());
2605 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2606 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2607 _mm_setzero_si128());
2608 fracm = _mm_srli_epi16(subtc, 1);
2609 pix1 = _mm_add_epi16(pix1,
2610 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2611 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2612 pix3 = _mm_add_epi16(pix3,
2613 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2614 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2615 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2616 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2617 pix2 = _mm_add_epi16(pix2,
2618 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2619 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2620 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2624 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2625 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2626 tci = _mm_madd_epi16(tci, tcoffset);
2627 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2628 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2629 _mm_setzero_si128());
2630 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2631 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2632 _mm_setzero_si128());
2633 fracm = _mm_srli_epi16(subtc, 1);
2634 pix1 = _mm_add_epi16(pix1,
2635 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2636 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2637 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2638 pix1 = _mm_add_epi16(pix1,
2639 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2640 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2641 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2647 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2649 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2650 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2651 tci = _mm_madd_epi16(tci, tcoffset);
2652 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2653 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2654 _mm_setzero_si128());
2655 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2656 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2657 _mm_setzero_si128());
2658 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2659 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2660 tci = _mm_madd_epi16(tci, tcoffset);
2661 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2662 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2663 _mm_setzero_si128());
2664 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2665 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2666 _mm_setzero_si128());
2667 fracm = _mm_srli_epi16(subtc, 1);
2668 pix1 = _mm_add_epi16(pix1,
2669 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2670 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2671 pix3 = _mm_add_epi16(pix3,
2672 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2673 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2674 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2675 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2676 pix2 = _mm_add_epi16(pix2,
2677 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2678 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2679 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2683 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2684 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2685 tci = _mm_madd_epi16(tci, tcoffset);
2686 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2687 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2688 _mm_setzero_si128());
2689 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2690 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2691 _mm_setzero_si128());
2692 fracm = _mm_srli_epi16(subtc, 1);
2693 pix1 = _mm_add_epi16(pix1,
2694 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2695 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2696 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2697 pix1 = _mm_add_epi16(pix1,
2698 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2699 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2700 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2707 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2709 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2711 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2712 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2713 tci = _mm_madd_epi16(tci, tcoffset);
2714 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2715 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2719 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2720 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2721 tci = _mm_madd_epi16(tci, tcoffset);
2722 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2728 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2730 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2731 tci = _mm_and_si128(tci, tcmax);
2732 tci = _mm_madd_epi16(tci, tcoffset);
2733 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2734 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2738 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2739 tci = _mm_and_si128(tci, tcmax);
2740 tci = _mm_madd_epi16(tci, tcoffset);
2741 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube texture sampling for a BGRA8 span.  The only statement shown fills the
// span's output pixels with 0xFF bytes (opaque white); triangle, texunitindex,
// arrayindex and zf are not used by it.
//   span   - supplies startx/endx
//   out4ub - output, 4 bytes per pixel, indexed by span-relative x
2750 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// the byte count is (endx - startx) pixels * 4; with the operands reversed the
// count was negative and memset would have received it as a huge size_t
2753 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Takes a float vector and returns a float.
// NOTE(review): judging by the name this is a shadowmap lookup, but no
// statement of the body is shown here - confirm what it returns and how
// 'vector' is used before relying on it.
2756 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a float RGBA buffer by an interpolated vertex attribute.
// For every x in [startx, endx) and each of the four channels:
//   out4f[x] = in4f[x] * ((data + slope*x) * z)
// where data/slope are produced by DPSOFTRAST_CALCATTRIB4F for 'arrayindex'.
//   triangle, span - passed to DPSOFTRAST_CALCATTRIB4F; span also gives startx/endx
//   out4f, in4f    - four floats per pixel, indexed by span-relative x
// NOTE(review): z is presumably zf[x]; its assignment is not shown here - confirm.
2762 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2765 int startx = span->startx;
2766 int endx = span->endx;
2771 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2772 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2775 c[0] = (data[0] + slope[0]*x) * z;
2776 c[1] = (data[1] + slope[1]*x) * z;
2777 c[2] = (data[2] + slope[2]*x) * z;
2778 c[3] = (data[3] + slope[3]*x) * z;
2779 out4f[x*4+0] = in4f[x*4+0] * c[0];
2780 out4f[x*4+1] = in4f[x*4+1] * c[1];
2781 out4f[x*4+2] = in4f[x*4+2] * c[2];
2782 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute to a float RGBA buffer.
// For every x in [startx, endx) and each of the four channels:
//   out4f[x] = (data + slope*x) * z
// where data/slope are produced by DPSOFTRAST_CALCATTRIB4F for 'arrayindex'.
//   triangle, span - passed to DPSOFTRAST_CALCATTRIB4F; span also gives startx/endx
//   out4f          - four floats per pixel, indexed by span-relative x
// NOTE(review): z is presumably zf[x]; its assignment is not shown here - confirm.
2786 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2789 int startx = span->startx;
2790 int endx = span->endx;
2795 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2796 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2799 c[0] = (data[0] + slope[0]*x) * z;
2800 c[1] = (data[1] + slope[1]*x) * z;
2801 c[2] = (data[2] + slope[2]*x) * z;
2802 c[3] = (data[3] + slope[3]*x) * z;
2803 out4f[x*4+0] = c[0];
2804 out4f[x*4+1] = c[1];
2805 out4f[x*4+2] = c[2];
2806 out4f[x*4+3] = c[3];
// Adds a thresholded second buffer to a first one, per channel:
//   out4f[x] = ina4f[x] + max(inb4f[x] - subcolor, 0)   for x in [startx, endx)
//   span     - supplies startx/endx
//   out4f, ina4f, inb4f - four floats per pixel, indexed by span-relative x
//   subcolor - four floats subtracted from inb4f before the addition
// 'triangle' is not referenced by the statements shown here.
2810 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2812 int x, startx = span->startx, endx = span->endx;
2813 float c[4], localcolor[4];
// local copy of subcolor, read inside the loop instead of the pointer
2814 localcolor[0] = subcolor[0];
2815 localcolor[1] = subcolor[1];
2816 localcolor[2] = subcolor[2];
2817 localcolor[3] = subcolor[3];
2818 for (x = startx;x < endx;x++)
// subtract the threshold color and clamp negative results to zero
2820 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2821 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2822 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2823 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2824 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2825 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2826 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2827 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Per-channel product of two float RGBA buffers:
//   out4f[x] = ina4f[x] * inb4f[x]   for x in [startx, endx)
//   span - supplies startx/endx
//   out4f, ina4f, inb4f - four floats per pixel, indexed by span-relative x
// 'triangle' is not referenced by the statements shown here.
2831 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2833 int x, startx = span->startx, endx = span->endx;
2834 for (x = startx;x < endx;x++)
2836 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2837 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2838 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2839 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Per-channel sum of two float RGBA buffers:
//   out4f[x] = ina4f[x] + inb4f[x]   for x in [startx, endx)
//   span - supplies startx/endx
//   out4f, ina4f, inb4f - four floats per pixel, indexed by span-relative x
// The sum is not clamped here.  'triangle' is not referenced by the
// statements shown here.
2843 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2845 int x, startx = span->startx, endx = span->endx;
2846 for (x = startx;x < endx;x++)
2848 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2849 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2850 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2851 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Mixes two float RGBA buffers per pixel:
//   out4f[x] = ina4f[x] * a + inb4f[x] * b   for x in [startx, endx)
// with a = 1 - (alpha of inb4f at x).
//   span - supplies startx/endx
//   out4f, ina4f, inb4f - four floats per pixel, indexed by span-relative x
// NOTE(review): b is presumably the alpha of inb4f at x (making this a blend
// of inb over ina); its assignment is not shown here - confirm.
// 'triangle' is not referenced by the statements shown here.
2855 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2857 int x, startx = span->startx, endx = span->endx;
2859 for (x = startx;x < endx;x++)
// weight of the first buffer: one minus the second buffer's alpha
2861 a = 1.0f - inb4f[x*4+3];
2863 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2864 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2865 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2866 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2870 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2872 int x, startx = span->startx, endx = span->endx;
2873 float localcolor[4], ilerp, lerp;
2874 localcolor[0] = color[0];
2875 localcolor[1] = color[1];
2876 localcolor[2] = color[2];
2877 localcolor[3] = color[3];
2878 ilerp = 1.0f - localcolor[3];
2879 lerp = localcolor[3];
2880 for (x = startx;x < endx;x++)
2882 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2883 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2884 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2885 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies the 8-bit pixels of in4ub by an interpolated vertex attribute
// (selected by arrayindex) and writes the result to out4ub, for the pixels
// of [span->startx, span->endx).
// The attribute is evaluated as (data + slope * x) * zf[x] at the ends of
// sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels and stepped in
// integer fixed point (scaled by 256) in between.
// NOTE(review): several lines of this function are not visible in this
// listing - the declarations of x, data, slope, mod and endmod, whatever
// assigns mod and submod at the start of each sub-span (presumably from
// endmod/endsubmod), and the control statement around the single-pixel
// tail. Confirm against the full file before relying on the notes below.
2891 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2895 int startx = span->startx;
2896 int endx = span->endx;
2899 __m128i submod, substep, endsubmod;
// DPSOFTRAST_CALCATTRIB is defined elsewhere; it presumably fills data/slope with the attribute's start value and per-pixel x slope - TODO confirm
2900 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 (presumably RGBA -> BGRA to match the byte buffers)
2901 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2902 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as integer scaled by 256
2903 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2904 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2905 for (x = startx; x < endx;)
2907 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2908 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end on the final pixel and rescale the step to the shorter length
2909 if (nextsub >= endx)
2911 nextsub = endsub = endx-1;
2912 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
2916 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// per-pixel fixed-point step across the sub-span
2917 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2918 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16 bit: low half holds the value for pixel x, high half the value for pixel x+1
2919 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2920 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): x advances by two pixels per iteration but submod advances by a single per-pixel substep; looks like the modulation steps at half rate within a sub-span - confirm
2921 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// input bytes go into the high byte of each 16-bit lane, so mulhi yields (byte * submod) >> 8
2923 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2924 pix = _mm_mulhi_epu16(pix, submod);
// pack back to bytes with unsigned saturation and store two pixels
2925 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the loop body above (its guard is not visible in this listing)
2929 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2930 pix = _mm_mulhi_epu16(pix, submod);
2931 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute (selected by arrayindex) to out4ub
// as 8-bit pixels for [span->startx, span->endx).
// The attribute is evaluated as (data + slope * x) * zf[x] at the ends of
// sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels, stepped in integer
// fixed point scaled by 4095 in between, then shifted right by 4 and packed
// to bytes with unsigned saturation.
// NOTE(review): several lines of this function are not visible in this
// listing - the declarations of x, data, slope, mod and endmod, whatever
// assigns mod and submod at the start of each sub-span (presumably from
// endmod/endsubmod), and the control statement around the single-pixel
// tail. Confirm against the full file before relying on the notes below.
2938 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2942 int startx = span->startx;
2943 int endx = span->endx;
2946 __m128i submod, substep, endsubmod;
// DPSOFTRAST_CALCATTRIB is defined elsewhere; it presumably fills data/slope with the attribute's start value and per-pixel x slope - TODO confirm
2947 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 (presumably RGBA -> BGRA to match the byte buffer)
2948 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2949 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as integer scaled by 4095
2950 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2951 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2952 for (x = startx; x < endx;)
2954 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2955 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end on the final pixel and rescale the step to the shorter length
2956 if (nextsub >= endx)
2958 nextsub = endsub = endx-1;
2959 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
2963 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// per-pixel fixed-point step across the sub-span
2964 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2965 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16 bit: low half holds the value for pixel x, high half the value for pixel x+1
2966 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2967 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): x advances by two pixels per iteration but submod advances by a single per-pixel substep; looks like the value steps at half rate within a sub-span - confirm
2968 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then pack to bytes with unsigned saturation
2970 __m128i pix = _mm_srai_epi16(submod, 4);
2971 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the loop body above (its guard is not visible in this listing)
2975 __m128i pix = _mm_srai_epi16(submod, 4);
2976 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit counterpart of the bloom combine: out4ub = ina4ub + (inb4ub - subcolor),
// computed in 16-bit lanes and packed back to bytes with unsigned saturation,
// two pixels per iteration over [span->startx, span->endx).
// triangle is not read in the lines shown.
// NOTE(review): the float DPSOFTRAST_Draw_Span_AddBloom clamps
// (inb - subcolor) at zero before adding; here the difference is added as a
// signed 16-bit value with no clamp, so pixels where inb is below subcolor
// are darkened rather than left unchanged - confirm whether that is intended.
// NOTE(review): the control statement around the single-pixel tail is not
// visible in this listing (presumably it runs only when one pixel is left).
2983 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2986 int x, startx = span->startx, endx = span->endx;
// subcolor scaled by 255 and converted to integers, with components 0 and 2 swapped
2987 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes and replicate the four components for both pixels of a pair
2988 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2989 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input from bytes to 16-bit lanes
2991 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2992 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2993 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2994 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above
2998 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2999 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3000 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3001 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Multiplies two 8-bit spans component by component, two pixels per
// iteration over [span->startx, span->endx): each output byte is
// (ina * inb) >> 8, packed with unsigned saturation.
// triangle is not read in the lines shown.
// NOTE(review): the control statement around the single-pixel tail is not
// visible in this listing (presumably it runs only when one pixel is left).
3006 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3009 int x, startx = span->startx, endx = span->endx;
3010 for (x = startx;x+2 <= endx;x+=2)
// ina bytes land in the low byte of each 16-bit lane ...
3012 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
// ... and inb bytes in the high byte, so the high half of the product is (ina * inb) >> 8
3013 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3014 pix1 = _mm_mulhi_epu16(pix1, pix2);
3015 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above
3019 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3020 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3021 pix1 = _mm_mulhi_epu16(pix1, pix2);
3022 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds two 8-bit spans component by component, two pixels per iteration
// over [span->startx, span->endx); sums are formed in 16-bit lanes and
// packed back to bytes with unsigned saturation, so they cap at 255.
// triangle is not read in the lines shown.
// NOTE(review): the control statement around the single-pixel tail is not
// visible in this listing (presumably it runs only when one pixel is left).
3027 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3030 int x, startx = span->startx, endx = span->endx;
3031 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input from bytes to 16-bit lanes
3033 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3034 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3035 pix1 = _mm_add_epi16(pix1, pix2);
3036 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above
3040 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3041 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3042 pix1 = _mm_add_epi16(pix1, pix2);
3043 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of one 8-bit span to another, two pixels per iteration
// over [span->startx, span->endx): out4ub = ina4ub + ((inb4ub * tint) >> 8),
// where tint is inbtintbgra scaled by 256; results are packed to bytes with
// unsigned saturation. triangle is not read in the lines shown.
// Unlike the other colour uniforms handled in this file, the tint is not
// component-swapped here (its parameter name suggests it is already BGRA).
// NOTE(review): the control statement around the single-pixel tail is not
// visible in this listing (presumably it runs only when one pixel is left).
3048 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3051 int x, startx = span->startx, endx = span->endx;
// tint as integers scaled by 256
3052 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// narrow to 16-bit lanes and replicate the four components for both pixels of a pair
3053 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3054 for (x = startx;x+2 <= endx;x+=2)
3056 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
// inb bytes go into the high byte of each lane so that mulhi with tint gives (inb * tint) >> 8
3057 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3058 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3059 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above
3063 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3064 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3065 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3066 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends two 8-bit spans using the second span's fourth component as the
// weight, two pixels per iteration over [span->startx, span->endx):
// out4ub = ina4ub + (((inb4ub - ina4ub) * inb alpha) >> 8), packed to bytes
// with unsigned saturation. triangle is not read in the lines shown.
// NOTE(review): the control statement around the single-pixel tail is not
// visible in this listing (presumably it runs only when one pixel is left).
3071 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3074 int x, startx = span->startx, endx = span->endx;
3075 for (x = startx;x+2 <= endx;x+=2)
3077 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3078 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth component across that pixel's four lanes
3079 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4, so the signed high half of the product is (difference * alpha) >> 8
3080 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3081 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above; only the low four lanes carry data
3085 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3086 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3087 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3088 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends every pixel of an 8-bit span toward one constant colour, using the
// colour's own fourth component as the weight, two pixels per iteration over
// [span->startx, span->endx):
// out4ub = in4ub + (((color - in4ub) * color alpha) >> 8), packed to bytes
// with unsigned saturation. triangle is not read in the lines shown.
// NOTE(review): the control statement around the single-pixel tail is not
// visible in this listing (presumably it runs only when one pixel is left).
3094 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3097 int x, startx = span->startx, endx = span->endx;
// color scaled by 255 and converted to integers, with components 0 and 2 swapped
3098 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes and replicate the four components for both pixels of a pair
3099 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// broadcast the fourth component to every lane, pre-shifted left by 4 for the mulhi below
3100 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3101 for (x = startx;x+2 <= endx;x+=2)
3103 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted left by 4 as well, so the signed high half of the product is (difference * alpha) >> 8
3104 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3105 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the loop body above
3109 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3110 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3111 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3118 void DPSOFTRAST_VertexShader_Generic(void)
3120 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3121 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3122 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3123 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3124 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3127 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3129 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3130 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3132 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3133 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3134 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3136 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3137 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3138 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3140 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3141 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3144 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3146 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3149 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3151 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3154 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3159 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3160 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3165 void DPSOFTRAST_VertexShader_PostProcess(void)
3167 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3168 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3169 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3172 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3174 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3175 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3176 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3177 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3178 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3179 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3180 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3182 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3183 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3185 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3186 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3188 // TODO: implement saturation
3190 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3192 // TODO: implement gammaramps
3194 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3199 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3201 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3204 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3206 // this is never called (because colormask is off when this shader is used)
3207 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3208 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3209 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3210 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3211 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3216 void DPSOFTRAST_VertexShader_FlatColor(void)
3218 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3219 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3222 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3224 int x, startx = span->startx, endx = span->endx;
3225 int Color_Ambienti[4];
3226 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3227 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3230 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3231 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3232 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3233 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3234 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3235 for (x = startx;x < endx;x++)
3237 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3238 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3239 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3240 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3242 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3247 void DPSOFTRAST_VertexShader_VertexColor(void)
3249 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3250 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3251 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader for one span.
// Per pixel: result = texture * (Color_Diffuse * interpolated vertex colour
// + Color_Ambient), evaluated in 16-bit fixed point and packed to bytes with
// unsigned saturation. The fourth lane of the ambient term is replaced by
// the Alpha uniform and the fourth lane of the diffuse term is zeroed.
// Output goes directly to the framebuffer row unless alpha test or a
// non-opaque blend mode is active, in which case it goes to a temporary
// buffer that is then passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several lines of this function are not visible in this
// listing - the declarations of data, slope, pix2 and mod2, and the control
// flow that separates the four-pixel path from the single-pixel path
// (presumably the fast path skips ahead and continues, and the slow path
// skips pixels whose pixelmask byte is zero). Confirm against the full file.
3254 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3257 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's location in the first colour framebuffer, 4 bytes per pixel
3258 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3259 int x, startx = span->startx, endx = span->endx;
3260 __m128i Color_Ambientm, Color_Diffusem;
3262 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3263 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3264 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3265 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3266 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3267 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finish pass, so render into the temporary buffer instead
3268 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3269 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256, components 0 and 2 swapped
3270 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear the fourth lane and substitute the Alpha uniform (scaled by 255, unlike the 256 used for the colour lanes)
3271 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3272 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrow to 16-bit lanes, duplicated into both halves
3273 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 4096, components 0 and 2 swapped, fourth lane zeroed
3274 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3275 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3276 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// DPSOFTRAST_CALCATTRIB is defined elsewhere; it presumably fills data/slope with the colour attribute's start value and per-pixel x slope - TODO confirm
3277 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3278 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3279 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// advance to the first pixel of the span and scale value and step by 4096
3280 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3281 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3282 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3283 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3285 __m128i color, mod, pix;
// four-pixel path: taken when at least four pixels remain and all four pixelmask bytes equal 1
3286 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3289 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3290 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// each pixel's attribute is multiplied by its own buffer_z value; data is stepped between pixels
3291 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3292 data = _mm_add_ps(data, slope);
3293 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3294 data = _mm_add_ps(data, slope);
3295 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3296 data = _mm_add_ps(data, slope);
3297 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex colour) >> 16, plus ambient, times the texture byte placed in the high byte of each lane
3298 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3299 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3300 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3301 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
// store all four pixels at once
3302 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one pixel
3308 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3309 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3310 mod = _mm_packs_epi32(mod, mod);
3311 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3312 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish pass
3314 if (pixel == buffer_FragColorbgra8)
3315 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3321 void DPSOFTRAST_VertexShader_Lightmap(void)
3323 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3324 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3325 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span.
// Per pixel: result = colour texture * (Color_Diffuse * lightmap texture
// + Color_Ambient), plus Color_Glow * glow texture when the GLOW permutation
// bit is set; evaluated in 16-bit fixed point and packed to bytes with
// unsigned saturation. The fourth lane of the ambient term is replaced by
// the Alpha uniform; the fourth lanes of the diffuse and glow terms are
// zeroed.
// Output goes directly to the framebuffer row unless alpha test or a
// non-opaque blend mode is active, in which case it goes to a temporary
// buffer that is then passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several lines of this function are not visible in this
// listing - the declaration of pix2, the branch that selects the second
// (non-glow) loop, and the control flow that separates the four-pixel path
// from the single-pixel path in each loop (presumably the fast path skips
// ahead and continues, and the slow path skips pixels whose pixelmask byte
// is zero). Confirm against the full file.
3328 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3331 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's location in the first colour framebuffer, 4 bytes per pixel
3332 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3333 int x, startx = span->startx, endx = span->endx;
3334 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3335 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3336 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3337 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3338 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3339 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3340 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// colour texture uses texcoord array 0, lightmap texture uses texcoord array 4
3341 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3342 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test or blending needs the finish pass, so render into the temporary buffer instead
3343 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3344 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256, components 0 and 2 swapped
3345 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear the fourth lane and substitute the Alpha uniform (scaled by 255, unlike the 256 used for the colour lanes)
3346 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3347 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrow to 16-bit lanes, duplicated into both halves
3348 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 256, components 0 and 2 swapped, fourth lane zeroed
3349 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3350 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3351 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// glow variant: an extra texture is sampled and added on top of the lit colour
3352 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3354 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow colour scaled by 256, components 0 and 2 swapped, fourth lane zeroed
3355 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3356 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3357 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// ambient in the low 64 bits, glow in the high 64 bits; used by the single-pixel path below
3358 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3359 for (x = startx;x < endx;x++)
3361 __m128i color, lightmap, glow, pix;
// four-pixel path: taken when at least four pixels remain and all four pixelmask bytes equal 1
3362 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3365 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3366 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3367 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// texture bytes sit in the high byte of each lane, so every mulhi yields (factor * byte) >> 8
3368 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3369 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3370 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3371 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3372 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3373 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
// store all four pixels at once
3374 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit colour is computed in the low half and the glow product in the high half of one register
3380 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3381 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3382 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
// lightmap's high half is zero here, so the high half reduces to Color_Glow * glow texture
3383 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// fold the glow half onto the lit-colour half
3384 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3385 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow (presumably the else branch of the GLOW test - its control line is not visible here)
3390 for (x = startx;x < endx;x++)
3392 __m128i color, lightmap, pix;
// four-pixel path: taken when at least four pixels remain and all four pixelmask bytes equal 1
3393 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3396 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3397 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3398 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3399 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3400 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3401 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3402 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one pixel
3408 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3409 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3410 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3411 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish pass
3414 if (pixel == buffer_FragColorbgra8)
3415 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3421 void DPSOFTRAST_VertexShader_FakeLight(void)
3423 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake-light shader: in the lines shown it performs no
// lighting at all - it zero-fills the span's part of a colour buffer and
// hands it to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): one line directly after the signature is not visible in
// this listing (presumably a comment) - confirm against the full file.
3426 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3429 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3430 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3431 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// four bytes per pixel, covering [span->startx, span->endx)
3432 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3433 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the model-space light-direction-map mode: does no work of
// its own and simply runs the lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3443 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3445 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for the tangent-space light-direction-map mode: does no work
// of its own and simply runs the lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3456 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3458 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex shader for the LightDirection mode.
// For each of dpsoftrast.numvertices vertices it reads three vectors from the
// TEXCOORD1/TEXCOORD2/TEXCOORD3 post arrays (svector, tvector, normal), then replaces
//   TEXCOORD1 with (svector.LightDir, tvector.LightDir, normal.LightDir, 0) and
//   TEXCOORD2 with the same three dot products taken against (EyePosition - position).
// Finally the position array is run through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjectionMatrixM1 uniform.
// Presumably svector/tvector/normal form the per-vertex tangent-space basis, so the
// outputs are the light and eye vectors in tangent space -- TODO confirm.
// NOTE(review): this listing omits some source lines; the declarations of i, LightDir,
// EyeVector, position, svector, tvector and normal, and the loop braces, are not visible.
3464 void DPSOFTRAST_VertexShader_LightDirection(void)
3467 int numvertices = dpsoftrast.numvertices;
3469 float LightVector[4];
3470 float EyePosition[4];
3471 float EyeVectorModelSpace[4];
// copy the LightDir and EyePosition uniforms into locals (4 floats each)
3477 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3478 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3479 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3480 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3481 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3482 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3483 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3484 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// prepare the arrays read in the loop below; TEXCOORD0 additionally goes through the
// TexMatrixM1 uniform
3485 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3486 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3487 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3488 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3489 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3490 for (i = 0;i < numvertices;i++)
// read all inputs first: TEXCOORD1 and TEXCOORD2 are overwritten further down, so
// svector/tvector must be copied out before the results are stored
3492 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3493 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3494 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3495 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3496 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3497 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3498 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3499 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3500 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3501 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3502 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3503 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = LightDir expressed as its dot products with svector, tvector, normal
3504 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3505 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3506 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3507 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3508 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3509 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3510 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// EyeVector = (EyePosition - position) projected onto the same three vectors
3511 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3512 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3513 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3514 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3515 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3516 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3517 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3518 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3519 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3520 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// runs after the per-vertex work above, which reads position from the post array
// (presumably after the loop's closing brace, which is not visible in this listing)
3522 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helper macros used by the pixel shaders below.
// Min/Max return the smaller/larger argument; the selected argument is expanded twice,
// so arguments should be free of side effects.
3525 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3526 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of a and b.
3527 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length, and length (via sqrt), of the first three elements of v.
3528 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3529 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro operating on v; the visible part computes len = sqrt(v.v).
// NOTE(review): the remaining continuation lines of this macro are not visible in this
// listing -- presumably they scale v by 1/len; confirm against the full file.
3530 #define DPSOFTRAST_Vector3Normalize(v)\
3533 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for the LightDirection mode: shades one span into a local BGRA8 buffer
// and passes it to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread->shader_permutation selects one of three variants:
//   SPECULAR - ambient + diffuse + specular lighting using the normal and gloss textures
//   DIFFUSE  - ambient + diffuse lighting using the normal texture
//   neither  - color texture scaled by Color_Ambient only
// Every variant adds the glow texture scaled by Color_Glow when GLOW is set, and the
// output alpha is always diffusetex[3] * Color_Ambient[3].
//   thread   - per-thread state; uniform4f and shader_permutation are read here
//   triangle - triangle being drawn, forwarded to the span/attribute helpers
//   span     - span to shade; pixels startx..endx-1 are written
// NOTE(review): this listing omits some source lines (braces, else lines, and the
// declarations of glosstex, eyenormal, d, z, diffuse and specular), so the nesting
// described in these comments is inferred from the visible statements -- confirm
// against the full file.
3544 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3546 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3547 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3548 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3549 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3550 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3551 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3552 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3553 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3554 int x, startx = span->startx, endx = span->endx;
3555 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3556 float LightVectordata[4];
3557 float LightVectorslope[4];
3558 float EyeVectordata[4];
3559 float EyeVectorslope[4];
3561 float diffusetex[4];
3563 float surfacenormal[4];
3564 float lightnormal[4];
3566 float specularnormal[4];
3569 float SpecularPower;
// Color uniforms are copied with components 0 and 2 swapped so that local index 0 pairs
// with byte 0 of the *bgra8 buffers in the math below (presumably RGB uniforms -> BGR).
3571 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3572 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3573 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3574 Color_Glow[3] = 0.0f;
3575 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3576 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3577 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the alpha factor comes from the separate Alpha uniform, not from Color_Ambient
3578 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3579 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3580 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3581 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3582 Color_Pants[3] = 0.0f;
3583 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3584 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3585 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3586 Color_Shirt[3] = 0.0f;
// span setup and the color texture fetch happen for every variant
3587 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3588 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// pants/shirt and glow textures are fetched only when their permutation bit is set
3589 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3591 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3592 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3594 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3596 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- SPECULAR variant: ambient + diffuse + specular ----
3598 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3600 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3601 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3602 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3603 Color_Diffuse[3] = 0.0f;
3604 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3605 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3606 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3607 LightColor[3] = 0.0f;
// light vector interpolation terms come from TEXCOORD1 (written by the vertex shader)
3608 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3609 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3610 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3611 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3612 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3613 Color_Specular[3] = 0.0f;
// pre-scaled by 1/255; the exponent below multiplies it by glosstex[3], a 0..255 byte value
3614 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// eye vector interpolation terms come from TEXCOORD2
3615 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3616 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3617 for (x = startx;x < endx;x++)
3620 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3621 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3622 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3623 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// add the tinted pants/shirt layers; their [3] factors are 0, so alpha is unaffected
3624 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3626 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3627 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3628 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3629 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3631 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3632 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3633 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3634 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal texel: bytes 2,1,0 become x,y,z via byte/128 - 1, then normalize
3635 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3636 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3637 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3638 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated light direction at pixel x.
// NOTE(review): z is not assigned on any visible line -- presumably it is loaded from
// buffer_z[x] on a line omitted from this listing; TODO confirm.
3640 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3641 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3642 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3643 DPSOFTRAST_Vector3Normalize(lightnormal);
// interpolated eye direction at pixel x
3645 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3646 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3647 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3648 DPSOFTRAST_Vector3Normalize(eyenormal);
// normalized sum of the light and eye directions, used for the specular term
3650 specularnormal[0] = lightnormal[0] + eyenormal[0];
3651 specularnormal[1] = lightnormal[1] + eyenormal[1];
3652 specularnormal[2] = lightnormal[2] + eyenormal[2];
3653 DPSOFTRAST_Vector3Normalize(specularnormal);
// both lighting terms are clamped at zero before use
3655 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3656 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3657 specular = pow(specular, SpecularPower * glosstex[3]);
// results are truncated to int and clamped to 255 only (there is no lower clamp)
3658 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3660 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3661 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3662 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3663 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term (presumably the else branch; the else line is
// not visible in this listing)
3667 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3668 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3669 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3670 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3672 buffer_FragColorbgra8[x*4+0] = d[0];
3673 buffer_FragColorbgra8[x*4+1] = d[1];
3674 buffer_FragColorbgra8[x*4+2] = d[2];
3675 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- DIFFUSE variant: ambient + diffuse, no gloss texture or eye vector ----
3678 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3680 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3681 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3682 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3683 Color_Diffuse[3] = 0.0f;
3684 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3685 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3686 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3687 LightColor[3] = 0.0f;
3688 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3689 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3690 for (x = startx;x < endx;x++)
// NOTE(review): in the visible lines of this variant the pants/shirt texels are not
// added to diffusetex, unlike the SPECULAR variant above -- confirm this is intended.
3693 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3694 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3695 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3696 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3697 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3698 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3699 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3700 DPSOFTRAST_Vector3Normalize(surfacenormal);
3702 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3703 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3704 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3705 DPSOFTRAST_Vector3Normalize(lightnormal);
3707 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3708 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3710 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3711 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3712 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3713 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// same formula without the glow term (presumably the else branch)
3717 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3718 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3719 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3720 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3722 buffer_FragColorbgra8[x*4+0] = d[0];
3723 buffer_FragColorbgra8[x*4+1] = d[1];
3724 buffer_FragColorbgra8[x*4+2] = d[2];
3725 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- remaining variant (presumably the final else): ambient plus optional glow ----
3730 for (x = startx;x < endx;x++)
3733 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3734 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3735 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3736 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3738 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3740 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3741 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3742 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3743 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3747 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3748 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3749 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3750 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3752 buffer_FragColorbgra8[x*4+0] = d[0];
3753 buffer_FragColorbgra8[x*4+1] = d[1];
3754 buffer_FragColorbgra8[x*4+2] = d[2];
3755 buffer_FragColorbgra8[x*4+3] = d[3];
// pass the shaded span on to DPSOFTRAST_Draw_Span_FinishBGRA8
3758 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode.
// For each of dpsoftrast.numvertices vertices it reads three vectors from the
// TEXCOORD1/TEXCOORD2/TEXCOORD3 post arrays (svector, tvector, normal), then replaces
//   TEXCOORD1 with the dot products of those vectors against (LightPosition - position)
//   TEXCOORD2 with the dot products of those vectors against (EyePosition - position)
// both with a fourth component of 0.
// Afterwards the position array goes through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjectionMatrixM1 uniform, and TEXCOORD3 is produced by
// DPSOFTRAST_Array_Transform from the POSITION array and the ModelToLightM1 uniform.
// Presumably svector/tvector/normal form the per-vertex tangent-space basis -- TODO confirm.
// NOTE(review): this listing omits some source lines; the declarations of i, EyeVector,
// position, svector, tvector and normal, and the loop braces, are not visible.
3763 void DPSOFTRAST_VertexShader_LightSource(void)
3766 int numvertices = dpsoftrast.numvertices;
3767 float LightPosition[4];
3768 float LightVector[4];
3769 float LightVectorModelSpace[4];
3770 float EyePosition[4];
3771 float EyeVectorModelSpace[4];
// copy the LightPosition and EyePosition uniforms into locals (4 floats each)
3777 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3778 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3779 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3780 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3781 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3782 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3783 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3784 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// prepare the arrays; TEXCOORD0 additionally goes through the TexMatrixM1 uniform.
// TEXCOORD4 is loaded here but not read anywhere in this function.
3785 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3786 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3787 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3788 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3789 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3790 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3791 for (i = 0;i < numvertices;i++)
// read all inputs first: TEXCOORD1 and TEXCOORD2 are overwritten further down, so
// svector/tvector must be copied out before the results are stored
3793 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3794 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3795 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3796 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3797 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3798 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3799 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3800 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3801 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3802 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3803 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3804 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the light, then its dot products with svector/tvector/normal
3805 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3806 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3807 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3808 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3809 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3810 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
3811 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3812 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3813 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3814 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex to the eye, projected the same way
3815 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3816 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3817 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3818 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3819 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3820 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3821 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3822 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3823 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3824 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// these run after the per-vertex work above (presumably after the loop's closing
// brace, which is not visible in this listing); TEXCOORD3 is overwritten only now,
// after the loop has finished reading normal from it
3826 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3827 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3830 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3833 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3834 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3835 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3836 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3837 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3838 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3839 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3840 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3841 int x, startx = span->startx, endx = span->endx;
3842 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3843 float CubeVectordata[4];
3844 float CubeVectorslope[4];
3845 float LightVectordata[4];
3846 float LightVectorslope[4];
3847 float EyeVectordata[4];
3848 float EyeVectorslope[4];
3850 float diffusetex[4];
3852 float surfacenormal[4];
3853 float lightnormal[4];
3855 float specularnormal[4];
3858 float SpecularPower;
3859 float CubeVector[4];
3862 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3863 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3864 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3865 Color_Glow[3] = 0.0f;
3866 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3867 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3868 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3869 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3870 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3871 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3872 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3873 Color_Diffuse[3] = 0.0f;
3874 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3875 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3876 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3877 Color_Specular[3] = 0.0f;
3878 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3879 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3880 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3881 Color_Pants[3] = 0.0f;
3882 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3883 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3884 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3885 Color_Shirt[3] = 0.0f;
3886 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3887 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3888 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3889 LightColor[3] = 0.0f;
3890 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3891 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3892 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3893 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3894 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3895 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3896 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3897 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3899 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3900 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3902 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3903 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3904 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3906 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3907 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3908 for (x = startx;x < endx;x++)
3911 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3912 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3913 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3914 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3915 if (attenuation < 0.01f)
3917 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3919 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3920 if (attenuation < 0.01f)
3924 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3925 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3926 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3927 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3928 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3930 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3931 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3932 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3933 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3935 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3936 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3937 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3938 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3939 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3940 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3941 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3942 DPSOFTRAST_Vector3Normalize(surfacenormal);
3944 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3945 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3946 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3947 DPSOFTRAST_Vector3Normalize(lightnormal);
3949 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3950 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3951 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3952 DPSOFTRAST_Vector3Normalize(eyenormal);
3954 specularnormal[0] = lightnormal[0] + eyenormal[0];
3955 specularnormal[1] = lightnormal[1] + eyenormal[1];
3956 specularnormal[2] = lightnormal[2] + eyenormal[2];
3957 DPSOFTRAST_Vector3Normalize(specularnormal);
3959 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3960 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3961 specular = pow(specular, SpecularPower * glosstex[3]);
3962 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3964 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3965 attenuation *= (1.0f / 255.0f);
3966 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3967 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3968 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3969 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3973 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3974 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3975 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3976 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3978 buffer_FragColorbgra8[x*4+0] = d[0];
3979 buffer_FragColorbgra8[x*4+1] = d[1];
3980 buffer_FragColorbgra8[x*4+2] = d[2];
3981 buffer_FragColorbgra8[x*4+3] = d[3];
3984 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3986 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3987 for (x = startx;x < endx;x++)
3990 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3991 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3992 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3993 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3994 if (attenuation < 0.01f)
3996 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3998 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3999 if (attenuation < 0.01f)
4003 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4004 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4005 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4006 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4007 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4009 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4010 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4011 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4012 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4014 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4015 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4016 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4017 DPSOFTRAST_Vector3Normalize(surfacenormal);
4019 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4020 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4021 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4022 DPSOFTRAST_Vector3Normalize(lightnormal);
4024 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4025 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4027 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4028 attenuation *= (1.0f / 255.0f);
4029 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4030 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4031 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4032 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4036 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4037 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4038 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4039 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4041 buffer_FragColorbgra8[x*4+0] = d[0];
4042 buffer_FragColorbgra8[x*4+1] = d[1];
4043 buffer_FragColorbgra8[x*4+2] = d[2];
4044 buffer_FragColorbgra8[x*4+3] = d[3];
4049 for (x = startx;x < endx;x++)
4052 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4053 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4054 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4055 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4056 if (attenuation < 0.01f)
4058 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4060 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4061 if (attenuation < 0.01f)
4065 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4066 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4067 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4068 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4069 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4071 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4072 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4073 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4074 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4076 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4078 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4079 attenuation *= (1.0f / 255.0f);
4080 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4081 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4082 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4083 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4087 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4088 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4089 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4090 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4092 buffer_FragColorbgra8[x*4+0] = d[0];
4093 buffer_FragColorbgra8[x*4+1] = d[1];
4094 buffer_FragColorbgra8[x*4+2] = d[2];
4095 buffer_FragColorbgra8[x*4+3] = d[3];
4098 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Refraction shader mode (paired with DPSOFTRAST_PixelShader_Refraction
// in DPSOFTRAST_ShaderModeTable). Its only work is one DPSOFTRAST_Array_TransformProject call
// with DPSOFTRAST_ARRAY_POSITION as both array arguments and the ModelViewProjectionMatrixM1
// slot of dpsoftrast.uniform4f as the third argument.
4104 void DPSOFTRAST_VertexShader_Refraction(void)
4106 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the Refraction shader mode. No refraction shading happens in this body:
// DPSOFTRAST_Draw_Span_Begin is called with buffer_z, then the BGRA8 bytes (4 per pixel) for
// pixels span->startx .. span->endx-1 are set to zero and the buffer is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4109 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span
4112 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4113 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4114 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the colour buffer is initialized
4115 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4116 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Water shader mode (paired with DPSOFTRAST_PixelShader_Water in
// DPSOFTRAST_ShaderModeTable). Its only work is one DPSOFTRAST_Array_TransformProject call
// with DPSOFTRAST_ARRAY_POSITION as both array arguments and the ModelViewProjectionMatrixM1
// slot of dpsoftrast.uniform4f as the third argument.
4121 void DPSOFTRAST_VertexShader_Water(void)
4123 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the Water shader mode. No water shading happens in this body:
// DPSOFTRAST_Draw_Span_Begin is called with buffer_z, then the BGRA8 bytes (4 per pixel) for
// pixels span->startx .. span->endx-1 are set to zero and the buffer is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4127 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span
4130 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4131 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4132 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the colour buffer is initialized
4133 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4134 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode (paired with DPSOFTRAST_PixelShader_ShowDepth in
// DPSOFTRAST_ShaderModeTable). Its only work is one DPSOFTRAST_Array_TransformProject call
// with DPSOFTRAST_ARRAY_POSITION as both array arguments and the ModelViewProjectionMatrixM1
// slot of dpsoftrast.uniform4f as the third argument.
4139 void DPSOFTRAST_VertexShader_ShowDepth(void)
4141 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the ShowDepth shader mode. Despite the name, no depth value is written to the
// colour output in this body: DPSOFTRAST_Draw_Span_Begin is called with buffer_z, then the
// BGRA8 bytes (4 per pixel) for pixels span->startx .. span->endx-1 are set to zero and the
// buffer is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
4144 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span
4147 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4148 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the colour buffer is initialized
4150 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4151 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode (paired with
// DPSOFTRAST_PixelShader_DeferredGeometry in DPSOFTRAST_ShaderModeTable). Its only work is one
// DPSOFTRAST_Array_TransformProject call with DPSOFTRAST_ARRAY_POSITION as both array arguments
// and the ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f as the third argument.
4156 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4158 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredGeometry shader mode. No geometry-buffer output is produced in
// this body: DPSOFTRAST_Draw_Span_Begin is called with buffer_z, then the BGRA8 bytes (4 per
// pixel) for pixels span->startx .. span->endx-1 are set to zero and the buffer is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4161 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span
4164 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4165 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4166 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the colour buffer is initialized
4167 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4168 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode (paired with
// DPSOFTRAST_PixelShader_DeferredLightSource in DPSOFTRAST_ShaderModeTable). Its only work is
// one DPSOFTRAST_Array_TransformProject call with DPSOFTRAST_ARRAY_POSITION as both array
// arguments and the ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f as the third argument.
4173 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4175 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredLightSource shader mode. No lighting is computed in this body:
// DPSOFTRAST_Draw_Span_Begin is called with buffer_z, then the BGRA8 bytes (4 per pixel) for
// pixels span->startx .. span->endx-1 are set to zero and the buffer is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
4178 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span
4181 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4182 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4183 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the colour buffer is initialized
4184 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4185 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// One row of DPSOFTRAST_ShaderModeTable: the callbacks and resource lists for a shader mode.
// Rows are indexed by shader_mode (thread->shader_mode / dpsoftrast.shader_mode).
4190 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage callback; DPSOFTRAST_DrawTriangles calls it as .Vertex()
4193 void (*Vertex)(void);
// span-stage callback; DPSOFTRAST_Draw_ProcessSpans calls it as .Span(thread, triangle, span)
4194 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices used by the mode; the table writes ~0 after the last used entry
// and readers test each entry against DPSOFTRAST_ARRAY_TOTAL
4195 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by the mode; same ~0 convention, tested against DPSOFTRAST_MAXTEXTUREUNITS
4196 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4198 DPSOFTRAST_ShaderModeInfo;
// Dispatch table with one row per shader mode, indexed by shader_mode.
// Row layout: { lod array index (read as .lodarrayindex when picking the array that drives mip
// selection), vertex callback, span callback, array list, texture unit list }.
// Both lists are terminated with ~0, which stored in an unsigned char is larger than
// DPSOFTRAST_ARRAY_TOTAL / DPSOFTRAST_MAXTEXTUREUNITS and is what the reading loops test for.
// Every row spells out both lists: an omitted texunits initializer would be zero-filled, and
// zero is a valid texture unit index, so the mip setup loop would process unit 0 on every
// iteration instead of seeing an empty list.
4200 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4202 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4203 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4204 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4205 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4206 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4207 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4208 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4209 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4210 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4211 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4212 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
// the remaining modes have placeholder span shaders that use no arrays and no textures
4213 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}, {~0}},
4214 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}, {~0}},
4215 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}, {~0}},
4216 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}, {~0}},
4217 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}, {~0}}
// Processes every span queued in thread->spans: optionally depth-tests it into a per-pixel
// mask, runs the current shader mode's Span callback when a colour buffer exists and the
// colour mask is non-zero, and finally resets thread->numspans to 0.
// thread: rasterizer thread state that owns the span and triangle queues.
4220 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4227 // unsigned int *colorpixel;
4228 unsigned int *depthpixel;
4234 DPSOFTRAST_State_Triangle *triangle;
4235 DPSOFTRAST_State_Span *span;
// one mask byte per pixel of a span; span->pixelmask is pointed at this stack buffer below
4236 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4237 for (i = 0; i < thread->numspans; i++)
4239 span = &thread->spans[i];
4240 triangle = &thread->triangles[span->triangle];
// depth path: only when depth testing is enabled and a depth buffer is present
4241 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope, w[2] = origin) at the span origin
4243 wslope = triangle->w[0];
4244 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// convert to integer depth units; the polygon offset terms are subtracted from the base depth
4245 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4246 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel[x] addresses the depth buffer entry for span-relative x
4247 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4248 startx = span->startx;
// fill pixelmask[x] by comparing the stored depth with the interpolated depth d, per depth function
4250 switch(thread->fb_depthfunc)
4253 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4254 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4255 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4256 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4257 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4258 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4259 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4261 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4262 //for (x = startx;x < endx;x++)
4263 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4264 // if there is no color buffer, skip pixel shader
// shrink [startx, endx) past rejected pixels at either end
// NOTE(review): the loop bodies presumably step startx up / endx down - confirm
4265 while (startx < endx && !pixelmask[startx])
4267 while (endx > startx && !pixelmask[endx-1])
4270 continue; // no pixels to fill
// publish the mask and the trimmed start to the span before shading
4271 span->pixelmask = pixelmask;
4272 span->startx = startx;
4274 // run pixel shader if appropriate
4275 // do this before running depthmask code, to allow the pixelshader
4276 // to clear pixelmask values for alpha testing
4277 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4278 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth writes enabled: walk the same pixels with the interpolated depth d
// NOTE(review): presumably stores d where pixelmask[x] is still set - confirm
4279 if (thread->depthmask)
4280 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4286 // no depth testing means we're just dealing with color...
4287 // if there is no color buffer, skip pixel shader
4288 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes: mask bytes are set to 1 over [startx, endx)
4290 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4291 span->pixelmask = pixelmask;
4292 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// queue consumed
4296 thread->numspans = 0;
// Declares the Draw command via DEFCOMMAND with first argument 22 and the listed fields.
// NOTE(review): presumably this is what defines DPSOFTRAST_Command_Draw and DPSOFTRAST_OPCODE_Draw,
// and commandsize (used on Draw commands but absent from this field list) comes from the macro - confirm.
// Fields: datasize; starty/endy = y extent set by DPSOFTRAST_DrawTriangles; refcount = atomic
// counter decremented once per thread in DPSOFTRAST_Interpret_Draw; clipped; firstvertex,
// numvertices, numtriangles = draw range; arrays = float4 vertex storage; element3i/element3s =
// 32-bit / 16-bit triangle index lists.
4299 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Per-thread handler for a Draw command. For each triangle it clips against the near plane,
// computes an integer bounding box clamped to the scissor and this thread's y ranges, sets up
// interpolation planes for w and for every array the shader mode lists, picks mip levels for
// the mode's texture units, then walks the edges scanline by scanline and queues spans.
// Queues are flushed through DPSOFTRAST_Draw_ProcessSpans when full and once more at the end.
// Each thread decrements command->refcount once; the thread that brings it to zero frees
// command->arrays when the command carries no inline data.
// thread:  rasterizer thread state (scissor, viewport, y ranges, span/triangle queues).
// command: the draw command (vertex storage, element lists, counts, y extent).
4301 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4304 int cullface = thread->cullface;
4305 int minx, maxx, miny, maxy;
4306 int miny1, maxy1, miny2, maxy2;
4307 __m128i fbmin, fbmax;
4308 __m128 viewportcenter, viewportscale;
4309 int firstvertex = command->firstvertex;
4310 int numvertices = command->numvertices;
4311 int numtriangles = command->numtriangles;
4312 const int *element3i = command->element3i;
4313 const unsigned short *element3s = command->element3s;
4314 int clipped = command->clipped;
4321 int starty, endy, bandy;
4325 __m128 triangleedge1, triangleedge2, trianglenormal;
4328 DPSOFTRAST_State_Triangle *triangle;
4329 DPSOFTRAST_Texture *texture;
4330 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two y ranges (miny1..maxy1 and miny2..maxy2) to the scissor rectangle
4331 miny = thread->fb_scissor[1];
4332 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4333 miny1 = bound(miny, thread->miny1, maxy);
4334 maxy1 = bound(miny, thread->maxy1, maxy);
4335 miny2 = bound(miny, thread->miny2, maxy);
4336 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's y extent overlaps neither range: only the reference is dropped
// NOTE(review): presumably the function returns after this - confirm
4337 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4339 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare header has its vertex storage in a separate MM allocation
4341 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4342 MM_FREE(command->arrays);
4346 minx = thread->fb_scissor[0];
4347 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// bounds replicated as 16-bit (x, y) pairs for the packed min/max below; fbmax is made inclusive by the -1
4348 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4349 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4350 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4351 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4352 screen[3] = _mm_setzero_ps();
4353 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4354 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen coordinates; the next float4 array follows numvertices entries later
4356 const float *screencoord4f = command->arrays;
4357 const float *arrays = screencoord4f + numvertices*4;
4359 // generate the 3 edges of this triangle
4360 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices rebased to firstvertex, from the 16-bit element list
4363 e[0] = element3s[i*3+0] - firstvertex;
4364 e[1] = element3s[i*3+1] - firstvertex;
4365 e[2] = element3s[i*3+2] - firstvertex;
// vertex indices rebased to firstvertex, from the 32-bit element list
4369 e[0] = element3i[i*3+0] - firstvertex;
4370 e[1] = element3i[i*3+1] - firstvertex;
4371 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below:
//  SKIPBACKFACE            - builds the two screen-space edges that meet at screen[1], computes the
//                            scalar e1.x*e2.y - e1.y*e2.x (z of the edge cross product) into
//                            trianglenormal, and tests its sign against zero in both directions
//                            (NOTE(review): presumably selected by cullface - confirm).
//  CLIPPEDVERTEXLERP(k,p1,p2) - stores the clip fraction along edge p1->p2 in clipfrac[p1] and
//                            projects the interpolated vertex into screen[k].
//  CLIPPEDVERTEXCOPY(k,p1) - loads the stored screen coordinate of vertex p1 into screen[k].
//  GENATTRIBCOPY / GENATTRIBLERP - the same copy / lerp applied to the current attribute array,
//                            reusing clipfrac[p1].
//  GENATTRIBS              - chooses copy or lerp for each of three output attributes from a
//                            case number 0..6.
4380 #define SKIPBACKFACE \
4381 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4382 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4383 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4384 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4385 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4389 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4393 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4398 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4399 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4401 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4402 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4404 #define CLIPPEDVERTEXCOPY(k,p1) \
4405 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4407 #define GENATTRIBCOPY(attrib, p1) \
4408 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4409 #define GENATTRIBLERP(attrib, p1, p2) \
4411 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4412 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4414 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4418 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4419 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4420 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4421 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4422 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4423 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4424 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4430 // calculate distance from nearplane
4431 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4432 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4433 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify by which vertices have clipdist >= 0; each case emits 3 or 4 screen points
// (4 when a single vertex is cut off and the remaining shape is a quad)
4434 if (clipdist[0] >= 0.0f)
4436 if (clipdist[1] >= 0.0f)
4438 if (clipdist[2] >= 0.0f)
4441 // triangle is entirely in front of nearplane
4442 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4449 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4457 if (clipdist[2] >= 0.0f)
4459 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4466 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4473 else if (clipdist[1] >= 0.0f)
4475 if (clipdist[2] >= 0.0f)
4477 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4484 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4490 else if (clipdist[2] >= 0.0f)
4492 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4497 else continue; // triangle is entirely behind nearplane
4500 // calculate integer y coords for triangle points
// screeni holds the truncated (x, y) of each point as eight 16-bit values (point 2 is repeated
// when there are only three points); the shuffles reduce them to a single min and max pair
4501 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4502 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4503 screenmin = _mm_min_epi16(screeni, screenir),
4504 screenmax = _mm_max_epi16(screeni, screenir);
4505 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4506 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the bounding box to the scissor / thread bounds
4507 screenmin = _mm_max_epi16(screenmin, fbmin);
4508 screenmax = _mm_min_epi16(screenmax, fbmax);
4509 // skip offscreen triangles
4510 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// y extent of the clamped box; endy is exclusive
4512 starty = _mm_extract_epi16(screenmin, 1);
4513 endy = _mm_extract_epi16(screenmax, 1)+1;
// box lies entirely between this thread's two y ranges
4514 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift leaves the sign-extended y of each point in its 32-bit lane
4516 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle queue
4519 triangle = &thread->triangles[thread->numtriangles];
4521 // calculate attribute plans for triangle data...
4522 // okay, this triangle is going to produce spans, we'd better project
4523 // the interpolants now (this is what gives perspective texturing),
4524 // this consists of simply multiplying all arrays by the W coord
4525 // (which is basically 1/Z), which will be undone per-pixel
4526 // (multiplying by Z again) to get the perspective-correct array
4529 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4530 __m128 mipedgescale, mipdensity;
// edge x/y components divided by the scalar normal, then each of the four factors broadcast to its own register
4531 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4532 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4533 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4534 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4535 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast w (component 3) of each screen point
4536 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4537 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4538 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane for w, anchored at point 1: differences along both edges -> x/y slopes -> value at the origin
4539 attribedge1 = _mm_sub_ss(w0, w1);
4540 attribedge2 = _mm_sub_ss(w2, w1);
4541 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4542 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4543 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4544 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4545 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// triangle->w = { x slope, y slope, origin }
4546 _mm_store_ss(&triangle->w[0], attribxslope);
4547 _mm_store_ss(&triangle->w[1], attribyslope);
4548 _mm_store_ss(&triangle->w[2], attriborigin);
4549 mipedgescale = _mm_setzero_ps();
// same plane setup, four components at a time, for every array the shader mode lists;
// vertex values are multiplied by their w before the differences are taken
4550 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4552 __m128 attrib0, attrib1, attrib2;
4553 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4554 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next float4 array in the command's vertex storage
4556 arrays += numvertices*4;
4557 GENATTRIBS(attrib0, attrib1, attrib2);
4558 attriborigin = _mm_mul_ps(attrib1, w1);
4559 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4560 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4561 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4562 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4563 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// attribs[k] = { x slope, y slope, origin }, written with streaming stores
4564 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4565 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4566 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// for the array named by lodarrayindex: attribute change along each edge scaled by the
// reciprocal screen-space edge length (rsqrt of the squared length), used for mip selection
4567 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4569 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4570 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4571 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4572 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip level per texture unit: all zero unless computed below
4576 memset(triangle->mip, 0, sizeof(triangle->mip));
4577 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4579 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4580 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4582 texture = thread->texbound[texunit];
// only bound textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR are considered
4583 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two ints read from texture->mipmap[0][2..3], square, sum per edge, keep the smaller edge value
4585 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4586 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4587 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4588 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4589 // this will be multiplied in the texturing routine by the texture resolution
4590 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), clamped to the last available mip level
4593 y = (int)(log((float)y)*0.5f/M_LN2);
4594 if (y > texture->mipmaps - 1)
4595 y = texture->mipmaps - 1;
4596 triangle->mip[texunit] = y;
// scanline walk: the first piece is limited by maxy1; after each pass the limit becomes maxy2
// and y is raised to at least miny2, so the second y range is handled by the same loop
4602 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4605 __m128 xcoords, xslope;
// ycc lane = all ones where the point's y is less than the current y; the byte mask of these
// lanes selects the two edges (edge0, edge1; p and n are their two point indices) for this scanline
4606 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4607 int yccmask = _mm_movemask_epi8(ycc);
4608 int edge0p, edge0n, edge1p, edge1n;
// edge selection when four points are present
4615 case 0xFFFF: /*0000*/ y = endy; continue;
4616 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4617 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4618 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4619 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4620 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4621 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4622 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4623 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4624 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4625 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4626 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4627 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4628 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4629 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4630 case 0x0000: /*1111*/ y++; continue;
// edge selection when three points are present
4638 case 0xFFFF: /*000*/ y = endy; continue;
4639 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4640 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4641 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4642 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4643 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4644 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4645 case 0x0000: /*111*/ y++; continue;
// nexty = smallest y among the points not yet passed (passed lanes are forced to 0x7FFF so they
// lose the min), i.e. the last scanline that can share this edge pair; limited to the current band
4648 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4649 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4650 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4651 nexty = _mm_extract_epi16(ycc, 0);
4652 if (nexty >= bandy) nexty = bandy-1;
// edge0 reaches further right than edge1's leftmost x
// NOTE(review): presumably the two edges are swapped here - confirm
4653 if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
// xslope lanes 0/2 = dx/dy of edge0/edge1; xcoords lanes 0/2 = x of each edge at scanline y, plus 0.5
4662 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4663 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4664 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4665 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4666 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// one iteration per scanline up to nexty, stepping both edge x values by their slopes
4667 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4669 int startx, endx, offset;
4670 startx = _mm_cvtss_si32(xcoords);
4671 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4674 if (startx < 0) startx = 0;
// add (minx - startx) rounded down to a multiple of DPSOFTRAST_DRAW_MAXSPANLENGTH
4675 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4677 if (endx > maxx) endx = maxx;
4678 if (startx >= endx) continue;
// cut the run into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// span->startx / span->endx are relative to offset
4679 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4681 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4682 span->triangle = thread->numtriangles;
4685 span->startx = max(minx - offset, 0);
4686 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4687 if (span->startx >= span->endx)
// span queue full: process and reset it
4689 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4690 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: process pending spans before the slots are reused
4695 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4697 DPSOFTRAST_Draw_ProcessSpans(thread);
4698 thread->numtriangles = 0;
// drop this thread's reference; whoever reaches zero frees separately allocated vertex storage
4702 if (!ATOMIC_DECREMENT(command->refcount))
4704 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4705 MM_FREE(command->arrays);
// flush whatever is still queued
4708 if (thread->numspans > 0 || thread->numtriangles > 0)
4710 DPSOFTRAST_Draw_ProcessSpans(thread);
4711 thread->numtriangles = 0;
// Allocates a Draw command plus the vertex storage it needs, and points the global
// dpsoftrast.screencoord4f / dpsoftrast.post_array4f[] at that storage.
// Storage layout (numvertices float4 entries each): screen coordinates, post-transform
// position, then one array per entry listed for the current shader mode, followed by a copy
// of the triangle indices.
// firstvertex, numvertices, numtriangles: draw range, copied into the command.
// element3i / element3s: 32-bit / 16-bit index lists; the one in use is copied into the storage.
// Returns the new command (declared return type: DPSOFTRAST_Command_Draw *).
4716 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4720 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 arrays are always present: screen coordinates and position
4721 int datasize = 2*numvertices*sizeof(float[4]);
4722 DPSOFTRAST_Command_Draw *command;
4723 unsigned char *data;
// plus one float4 array per array listed for the current shader mode
4724 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4726 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4727 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4729 datasize += numvertices*sizeof(float[4]);
// plus room for the index copy, in 16-bit or 32-bit form
4732 datasize += numtriangles*sizeof(unsigned short[3]);
4734 datasize += numtriangles*sizeof(int[3]);
4735 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for the command buffer: allocate only the header there and put the data in a
// separate zeroed MM_CALLOC block
// NOTE(review): the MM_CALLOC result is not checked in the lines shown; a NULL return would be
// written through below - confirm how allocation failure is meant to be handled
4736 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4738 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4739 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data lives inline, directly after the command header
4743 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4744 data = (unsigned char *)command + commandsize;
4746 command->firstvertex = firstvertex;
4747 command->numvertices = numvertices;
4748 command->numtriangles = numtriangles;
4749 command->arrays = (float *)data;
// clear all post-transform array pointers, then hand out consecutive slices of the storage
4750 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4751 dpsoftrast.firstvertex = firstvertex;
4752 dpsoftrast.numvertices = numvertices;
4753 dpsoftrast.screencoord4f = (float *)data;
4754 data += numvertices*sizeof(float[4]);
4755 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4756 data += numvertices*sizeof(float[4]);
4757 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4759 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4760 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4762 dpsoftrast.post_array4f[j] = (float *)data;
4763 data += numvertices*sizeof(float[4]);
// the command keeps its own copy of the indices, placed after the vertex arrays
4765 command->element3i = NULL;
4766 command->element3s = NULL;
4769 command->element3s = (unsigned short *)data;
4770 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4774 command->element3i = (int *)data;
4775 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queues one draw call: allocates the Draw command, runs the current shader
// mode's Vertex() function, records the vertical extent of the draw and then
// hands the command to the rasterizer threads.
// Parameters are forwarded unchanged to DPSOFTRAST_Draw_AllocateDrawCommand.
4780 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4782 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4783 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// Row range of the draw, passed through bound() with limits 0 and fb_height
// (presumably a clamp - confirm against the macro definition).
4784 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4785 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// Empty row range: release a separately allocated data block (the command is
// header-sized only in that case) and take the command back out of the queue.
// The statement that leaves the function here is not shown in this listing
// (presumably a return - confirm).
4786 if (command->starty >= command->endy)
4788 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4789 MM_FREE(command->arrays);
4790 DPSOFTRAST_UndoCommand(command->commandsize);
4793 command->clipped = dpsoftrast.drawclipped;
// One reference per thread; the draw interpreter decrements this counter.
4794 command->refcount = dpsoftrast.numthreads;
// Sync the command queue, then wake every thread that is currently starving
// and whose row band (either the miny1/maxy1 or the miny2/maxy2 range)
// overlaps this draw.
4797 DPSOFTRAST_Draw_SyncCommands();
4800 for (i = 0; i < dpsoftrast.numthreads; i++)
4802 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4803 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4804 SDL_CondSignal(thread->drawcond);
// NOTE(review): the condition choosing between the signalling code above and
// this flush is not shown in this listing (presumably a threaded vs.
// non-threaded build switch - confirm).
4808 DPSOFTRAST_Draw_FlushThreads();
// Executes queued commands for one thread, starting at thread->commandoffset
// and stopping when the offset equals endoffset. Each command is dispatched on
// its opcode; the offset reached is written back to thread->commandoffset.
4812 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4814 int commandoffset = thread->commandoffset;
4815 while (commandoffset != endoffset)
4817 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4818 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned size of
// DPSOFTRAST_Command_<name>, and wrap the offset to 0 once it reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL. Any trailing line of the macro (e.g. a
// break) is not shown in this listing.
4820 #define INTERPCOMMAND(name) \
4821 case DPSOFTRAST_OPCODE_##name : \
4822 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4823 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4824 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4825 commandoffset = 0; \
4827 INTERPCOMMAND(Viewport)
4828 INTERPCOMMAND(ClearColor)
4829 INTERPCOMMAND(ClearDepth)
4830 INTERPCOMMAND(ColorMask)
4831 INTERPCOMMAND(DepthTest)
4832 INTERPCOMMAND(ScissorTest)
4833 INTERPCOMMAND(Scissor)
4834 INTERPCOMMAND(BlendFunc)
4835 INTERPCOMMAND(BlendSubtract)
4836 INTERPCOMMAND(DepthMask)
4837 INTERPCOMMAND(DepthFunc)
4838 INTERPCOMMAND(DepthRange)
4839 INTERPCOMMAND(PolygonOffset)
4840 INTERPCOMMAND(CullFace)
4841 INTERPCOMMAND(AlphaTest)
4842 INTERPCOMMAND(AlphaFunc)
4843 INTERPCOMMAND(SetTexture)
4844 INTERPCOMMAND(SetShader)
4845 INTERPCOMMAND(Uniform4f)
4846 INTERPCOMMAND(UniformMatrix4f)
4847 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so the offset advances by the size stored
// in the command itself. Unlike the fixed-size cases, the thread's offset is
// also written back right after each draw. The statement controlled by the
// wrap test is not shown in this listing.
4849 case DPSOFTRAST_OPCODE_Draw:
4850 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4851 commandoffset += command->commandsize;
4852 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4854 thread->commandoffset = commandoffset;
// The body of the Reset case is not shown in this listing.
4857 case DPSOFTRAST_OPCODE_Reset:
4862 thread->commandoffset = commandoffset;
// Thread entry point handed to SDL_CreateThread. data is the thread's own
// DPSOFTRAST_State_Thread. The loop keeps running while thread->index is
// non-negative.
4866 static int DPSOFTRAST_Draw_Thread(void *data)
4868 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4869 while(thread->index >= 0)
// Behind the queue head: interpret everything up to dpsoftrast.drawcommand.
4871 if (thread->commandoffset != dpsoftrast.drawcommand)
4873 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Caught up: re-check under drawmutex, and if there is still nothing to do,
// wake a flusher that marked itself as waiting, flag this thread as starving
// and sleep on drawcond until it is signalled.
// NOTE(review): the keyword pairing this locked section with the test above is
// not shown in this listing (presumably an else - confirm).
4877 SDL_LockMutex(thread->drawmutex);
4878 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
4880 if (thread->waiting) SDL_CondSignal(thread->waitcond);
4881 thread->starving = true;
4882 SDL_CondWait(thread->drawcond, thread->drawmutex);
4883 thread->starving = false;
4885 SDL_UnlockMutex(thread->drawmutex);
// Brings every thread's commandoffset up to dpsoftrast.drawcommand and then
// marks the command pool as empty (usedcommands = 0).
4892 static void DPSOFTRAST_Draw_FlushThreads(void)
4894 DPSOFTRAST_State_Thread *thread;
4896 DPSOFTRAST_Draw_SyncCommands();
// Pass 1: wake every thread that is behind the queue head and currently
// starving. The test is repeated after taking drawmutex.
4898 for (i = 0; i < dpsoftrast.numthreads; i++)
4900 thread = &dpsoftrast.threads[i];
4901 if (thread->commandoffset != dpsoftrast.drawcommand)
4903 SDL_LockMutex(thread->drawmutex);
4904 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4905 SDL_CondSignal(thread->drawcond);
4906 SDL_UnlockMutex(thread->drawmutex);
// Pass 2: for each thread that is still behind, set thread->waiting and sleep
// on its waitcond. The worker signals waitcond when it finds itself caught up
// and sees waiting set.
4910 for (i = 0; i < dpsoftrast.numthreads; i++)
4912 thread = &dpsoftrast.threads[i];
4914 if (thread->commandoffset != dpsoftrast.drawcommand)
4916 SDL_LockMutex(thread->drawmutex);
4917 if (thread->commandoffset != dpsoftrast.drawcommand)
4919 thread->waiting = true;
4920 SDL_CondWait(thread->waitcond, thread->drawmutex);
4921 thread->waiting = false;
4923 SDL_UnlockMutex(thread->drawmutex);
// Direct execution on the calling thread.
// NOTE(review): the condition choosing between the wait above and this call is
// not shown in this listing (presumably a threaded vs. non-threaded build
// switch - confirm).
4926 if (thread->commandoffset != dpsoftrast.drawcommand)
4927 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Every thread has consumed the queue, so the pool usage counter is reset.
4930 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point that runs DPSOFTRAST_Draw_FlushThreads(), i.e. returns
// once every thread has processed all queued commands.
4933 void DPSOFTRAST_Flush(void)
4935 DPSOFTRAST_Draw_FlushThreads();
// Public "finish" entry point.
// NOTE(review): the body is not present in this listing; presumably it
// forwards to DPSOFTRAST_Flush() - confirm against the full source.
4938 void DPSOFTRAST_Finish(void)
4943 void DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4953 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4954 dpsoftrast.bigendian = u.b[3];
4955 dpsoftrast.fb_width = width;
4956 dpsoftrast.fb_height = height;
4957 dpsoftrast.fb_depthpixels = depthpixels;
4958 dpsoftrast.fb_colorpixels[0] = colorpixels;
4959 dpsoftrast.fb_colorpixels[1] = NULL;
4960 dpsoftrast.fb_colorpixels[1] = NULL;
4961 dpsoftrast.fb_colorpixels[1] = NULL;
4962 dpsoftrast.viewport[0] = 0;
4963 dpsoftrast.viewport[1] = 0;
4964 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4965 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4966 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4967 dpsoftrast.texture_firstfree = 1;
4968 dpsoftrast.texture_end = 1;
4969 dpsoftrast.texture_max = 0;
4970 dpsoftrast.color[0] = 1;
4971 dpsoftrast.color[1] = 1;
4972 dpsoftrast.color[2] = 1;
4973 dpsoftrast.color[3] = 1;
4974 dpsoftrast.interlace = bound(0, interlace, 1);
4976 dpsoftrast.numthreads = bound(1, numthreads, 64);
4978 dpsoftrast.numthreads = 1;
4980 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4981 for (i = 0; i < dpsoftrast.numthreads; i++)
4983 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4985 thread->cullface = GL_BACK;
4986 thread->colormask[1] = 1;
4987 thread->colormask[2] = 1;
4988 thread->colormask[3] = 1;
4989 thread->blendfunc[0] = GL_ONE;
4990 thread->blendfunc[1] = GL_ZERO;
4991 thread->depthmask = true;
4992 thread->depthtest = true;
4993 thread->depthfunc = GL_LEQUAL;
4994 thread->scissortest = false;
4995 thread->alphatest = false;
4996 thread->alphafunc = GL_GREATER;
4997 thread->alphavalue = 0.5f;
4998 thread->viewport[0] = 0;
4999 thread->viewport[1] = 0;
5000 thread->viewport[2] = dpsoftrast.fb_width;
5001 thread->viewport[3] = dpsoftrast.fb_height;
5002 thread->scissor[0] = 0;
5003 thread->scissor[1] = 0;
5004 thread->scissor[2] = dpsoftrast.fb_width;
5005 thread->scissor[3] = dpsoftrast.fb_height;
5006 thread->depthrange[0] = 0;
5007 thread->depthrange[1] = 1;
5008 thread->polygonoffset[0] = 0;
5009 thread->polygonoffset[1] = 0;
5011 if (dpsoftrast.interlace)
5013 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5014 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5015 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5016 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5020 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5021 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5024 thread->numspans = 0;
5025 thread->numtriangles = 0;
5026 thread->commandoffset = 0;
5027 thread->waiting = false;
5028 thread->starving = false;
5030 thread->waitcond = SDL_CreateCond();
5031 thread->drawcond = SDL_CreateCond();
5032 thread->drawmutex = SDL_CreateMutex();
5035 thread->validate = -1;
5036 DPSOFTRAST_Validate(thread, -1);
5038 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5043 void DPSOFTRAST_Shutdown(void)
5047 if (dpsoftrast.numthreads > 0)
5049 DPSOFTRAST_State_Thread *thread;
5050 for (i = 0; i < dpsoftrast.numthreads; i++)
5052 thread = &dpsoftrast.threads[i];
5053 SDL_LockMutex(thread->drawmutex);
5055 SDL_CondSignal(thread->drawcond);
5056 SDL_UnlockMutex(thread->drawmutex);
5057 SDL_WaitThread(thread->thread, NULL);
5058 SDL_DestroyCond(thread->waitcond);
5059 SDL_DestroyCond(thread->drawcond);
5060 SDL_DestroyMutex(thread->drawmutex);
5064 for (i = 0;i < dpsoftrast.texture_end;i++)
5065 if (dpsoftrast.texture[i].bytes)
5066 MM_FREE(dpsoftrast.texture[i].bytes);
5067 if (dpsoftrast.texture)
5068 free(dpsoftrast.texture);
5069 if (dpsoftrast.threads)
5070 MM_FREE(dpsoftrast.threads);
5071 memset(&dpsoftrast, 0, sizeof(dpsoftrast));