3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
13 typedef qboolean bool;
// Alignment in bytes used by ATOMIC() below and by the _mm_malloc-based MM_MALLOC/MM_CALLOC helpers.
17 #define ATOMIC_SIZE 32
// Variant built on __attribute__ alignment and the __sync_* builtins.
// NOTE(review): the conditional that selects this variant is not visible in this excerpt.
21 #define ALIGN(var) var __attribute__((__aligned__(16)))
22 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// MEMORY_BARRIER is a store fence; the __sync_synchronize() alternative is left commented out
24 #define MEMORY_BARRIER (_mm_sfence())
25 //(__sync_synchronize())
26 #define ATOMIC_COUNTER volatile int
// increment/decrement evaluate to the updated counter value; ATOMIC_ADD discards its result
27 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
28 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
29 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC variant: __declspec alignment and the Interlocked* functions.
31 #elif defined(_MSC_VER)
32 #define ALIGN(var) __declspec(align(16)) var
33 #define ATOMIC(var) __declspec(align(32)) var
35 #define MEMORY_BARRIER (_mm_sfence())
37 #define ATOMIC_COUNTER volatile LONG
38 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
39 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
// unlike the __sync variant above, this ATOMIC_ADD is not cast to void
40 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// Fallback: ALIGN/ATOMIC leave the declaration unchanged (no alignment request).
49 #define ALIGN(var) var
50 #define ATOMIC(var) var
55 #include <SDL_thread.h>
// Non-atomic stand-ins: plain int counter, no barrier, and the SDL types reduced to void.
// NOTE(review): presumably the build without threading support - the selecting conditional is not visible here.
57 #define MEMORY_BARRIER ((void)0)
58 #define ATOMIC_COUNTER int
59 #define ATOMIC_INCREMENT(counter) (++(counter))
60 #define ATOMIC_DECREMENT(counter) (--(counter))
61 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
62 typedef void SDL_Thread;
63 typedef void SDL_cond;
64 typedef void SDL_mutex;
68 #include <emmintrin.h>
// Aligned allocation helpers built on _mm_malloc/_mm_free, aligned to ATOMIC_SIZE bytes.
70 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-like helper: allocates nmemb*size bytes and zero-fills them when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow; the return statement is not visible in this excerpt.
72 static void *MM_CALLOC(size_t nmemb, size_t size)
74 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
75 if (ptr != NULL) memset(ptr, 0, nmemb*size);
79 #define MM_FREE _mm_free
// C library variants of the same helpers.
// NOTE(review): presumably the alternative branch of a conditional that is not visible here; its MM_FREE is not visible either.
81 #define MM_MALLOC(size) malloc(size)
82 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Vertex attribute array slots: position, color and eight texcoord sets.
// DPSOFTRAST_ARRAY_TOTAL is the slot count and sizes per-attribute arrays such as
// DPSOFTRAST_State_Triangle.attribs and DPSOFTRAST_State.post_array4f.
86 typedef enum DPSOFTRAST_ARRAY_e
88 DPSOFTRAST_ARRAY_POSITION,
89 DPSOFTRAST_ARRAY_COLOR,
90 DPSOFTRAST_ARRAY_TEXCOORD0,
91 DPSOFTRAST_ARRAY_TEXCOORD1,
92 DPSOFTRAST_ARRAY_TEXCOORD2,
93 DPSOFTRAST_ARRAY_TEXCOORD3,
94 DPSOFTRAST_ARRAY_TEXCOORD4,
95 DPSOFTRAST_ARRAY_TEXCOORD5,
96 DPSOFTRAST_ARRAY_TEXCOORD6,
97 DPSOFTRAST_ARRAY_TEXCOORD7,
98 DPSOFTRAST_ARRAY_TOTAL
// One slot of the dpsoftrast.texture array (only part of the field list is visible in this excerpt).
102 typedef struct DPSOFTRAST_Texture_s
109 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts outstanding binds of this texture - its users are not visible here
112 ATOMIC_COUNTER binds;
// texel storage for all mip levels; a NULL pointer marks the slot as unused
113 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
114 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// COMMAND_SIZE is the granularity DPSOFTRAST_ALIGNCOMMAND rounds command sizes up to;
// COMMAND_ALIGN applies the ALIGN() attribute to command structs.
118 #define COMMAND_SIZE ALIGN_SIZE
119 #define COMMAND_ALIGN(var) ALIGN(var)
// Header shared by every command stored in the command pool.
121 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
123 unsigned char opcode;
124 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when a command does not fit in the space left at the end of the pool.
128 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> = opcodeval and the struct
// DPSOFTRAST_Command_<name>, which begins with the same opcode/commandsize header.
// NOTE(review): the line that expands `fields` is not visible in this excerpt.
130 #define DEFCOMMAND(opcodeval, name, fields) \
131 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
132 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
134 unsigned char opcode; \
135 unsigned short commandsize; \
137 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer below (2 MiB).
139 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command's size - its use is not visible in this excerpt.
140 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Byte buffer that queued commands are carved from; DPSOFTRAST_AllocateCommand tracks its
// freecommand/usedcommands members (declared on lines not visible here).
142 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
146 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
148 DPSOFTRAST_State_Command_Pool);
// Per-triangle interpolation data (only part of the field list is visible in this excerpt).
150 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
152 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array, three float4 rows as read by DPSOFTRAST_CALCATTRIB:
// [0] change per unit of span x, [1] change per unit of span y, [2] base value
154 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
156 DPSOFTRAST_State_Triangle);
// SSE attribute setup for a span. Sets
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + (span->x * slope + span->y * attribs[arrayindex][1])
// using aligned loads, so the attribs rows must be 16-byte aligned (the struct declares them with ALIGN()).
// `data` and `slope` are assigned to, so they must be __m128 lvalues.
158 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
159 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
160 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
161 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
162 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar (float[4]) attribute setup for a span. For each of the four components c:
//   slope[c] = attribs[arrayindex][0][c]
//   data[c]  = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c]
// triangle and span are pointers; data and slope are arrays (or pointers) of at least 4 floats.
// Every macro argument is parenthesized so expressions such as &span or base+offset expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
// NOTE(review): presumably the pixel count processed per sub-span - its use is not visible in this excerpt.
175 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels generated from a triangle.
177 typedef ALIGN(struct DPSOFTRAST_State_Span_s
179 int triangle; // triangle this span was generated by
180 int x; // framebuffer x coord
181 int y; // framebuffer y coord
182 int startx; // usable range (according to pixelmask)
183 int endx; // usable range (according to pixelmask)
184 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
186 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays in DPSOFTRAST_State_Thread.
188 #define DPSOFTRAST_DRAW_MAXSPANS 1024
189 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bits of DPSOFTRAST_State_Thread.validate: each one marks a group of derived values as stale.
// DPSOFTRAST_Validate recomputes the group (RecalcFB / RecalcDepthFunc / RecalcBlendFunc) and clears the bit.
191 #define DPSOFTRAST_VALIDATE_FB 1
192 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
193 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups together
194 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer distinguishes. DPSOFTRAST_RecalcBlendFunc picks one from the
// (sfactor, dfactor) pair and the blendsubtract flag; the pairs it maps are listed per entry.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,  // GL_ONE, GL_ZERO - also the fallback for unrecognized pairs
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,  // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,  // GL_SRC_ALPHA, GL_ONE
	DPSOFTRAST_BLENDMODE_ADD         = 3,  // GL_ONE, GL_ONE
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,  // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
	DPSOFTRAST_BLENDMODE_MUL         = 5,  // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
	DPSOFTRAST_BLENDMODE_MUL2        = 6,  // GL_DST_COLOR, GL_SRC_COLOR
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,  // GL_SRC_ALPHA, GL_ONE while blendsubtract is set
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,  // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
	DPSOFTRAST_BLENDMODE_INVADD      = 9,  // GL_ONE_MINUS_DST_COLOR, GL_ONE
	DPSOFTRAST_BLENDMODE_TOTAL       = 10  // number of blend modes, not a mode itself
}
DPSOFTRAST_BLENDMODE;
// Per-thread copy of the render state plus that thread's position in the shared command pool.
// Only part of the field list is visible in this excerpt.
212 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
231 float polygonoffset[2];
234 int shader_permutation;
236 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
238 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
239 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
241 // DPSOFTRAST_VALIDATE_ flags
244 // derived values (DPSOFTRAST_VALIDATE_FB)
247 ALIGN(float fb_viewportcenter[4]);
248 ALIGN(float fb_viewportscale[4]);
250 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
253 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into dpsoftrast.commandpool.commands; DPSOFTRAST_Draw_FreeCommandPool compares it with the pool's free offset
262 ATOMIC(volatile int commandoffset);
// waiting is set by DPSOFTRAST_Draw_FreeCommandPool around its wait on waitcond;
// starving is tested there before signalling drawcond
264 volatile bool waiting;
265 volatile bool starving;
// held around the waiting/starving handshake
268 SDL_mutex *drawmutex;
272 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
273 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
275 DPSOFTRAST_State_Thread);
// Global rasterizer state: render targets, vertex array pointers, the texture table,
// the thread array and the shared command pool. Only part of the field list is visible in this excerpt.
277 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets as set by DPSOFTRAST_SetRenderTargets
281 unsigned int *fb_depthpixels;
282 unsigned int *fb_colorpixels[4];
// recomputed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
285 ALIGN(float fb_viewportcenter[4]);
286 ALIGN(float fb_viewportscale[4]);
289 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
290 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
292 const float *pointer_vertex3f;
293 const float *pointer_color4f;
294 const unsigned char *pointer_color4ub;
295 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
298 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
299 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the texture array below; DPSOFTRAST_Texture_Grow rebases them after a realloc
300 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
304 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
305 float *screencoord4f;
311 int shader_permutation;
// texture table: DPSOFTRAST_Texture_New starts its search for an unused slot at texture_firstfree
315 int texture_firstfree;
316 DPSOFTRAST_Texture *texture;
// message describing the most recent failure, set by the texture functions
321 const char *errorstring;
325 DPSOFTRAST_State_Thread *threads;
// set to commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
327 ATOMIC(volatile int drawcommand);
329 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance used throughout this file
333 DPSOFTRAST_State dpsoftrast;
// scale applied to depth values when converting to integers: 1024 * 2^20 = 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit BGRA value (a in bits 24-31, r in 16-23, g in 8-15, b in 0-7).
// Each channel is scaled by 255 and rounded by adding 0.5 before truncation; no clamping is performed.
// Arguments are parenthesized so expression arguments scale correctly, and the shifts are done on
// unsigned values so that an alpha of 255 reaching bit 31 is well defined. The result is still an int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth to the integer depth format: DPSOFTRAST_DEPTHSCALE * (1 - d), truncated.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
341 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
343 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
344 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
345 fb_viewportcenter[3] = 0.5f;
346 fb_viewportcenter[0] = 0.0f;
347 fb_viewportscale[1] = 0.5f * viewport[2];
348 fb_viewportscale[2] = -0.5f * viewport[3];
349 fb_viewportscale[3] = 0.5f;
350 fb_viewportscale[0] = 1.0f;
// Recomputes the DPSOFTRAST_VALIDATE_FB derived values of one thread: the scissor rectangle
// in framebuffer coordinates (fb_scissor = {x, y, width, height}) and the viewport transform.
// NOTE(review): the declarations and the lower-bound clamps of x1/y1 are on lines not visible in this excerpt.
353 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
355 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
356 // and viewport projection values
359 x1 = thread->scissor[0];
360 x2 = thread->scissor[0] + thread->scissor[2];
// y is flipped against the framebuffer height
361 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
362 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off the whole framebuffer is used
363 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
365 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
367 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
368 thread->fb_scissor[0] = x1;
369 thread->fb_scissor[1] = y1;
370 thread->fb_scissor[2] = x2 - x1;
371 thread->fb_scissor[3] = y2 - y1;
373 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
376 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
378 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Recomputes thread->fb_blendmode from the blendsubtract flag and the (sfactor, dfactor) pair in
// thread->blendfunc. The pair is packed as (sfactor<<16)|dfactor so one switch can match both factors.
// Any pair that is not listed selects DPSOFTRAST_BLENDMODE_OPAQUE.
381 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only GL_SRC_ALPHA, GL_ONE is recognized
383 if (thread->blendsubtract)
385 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
387 #define BLENDFUNC(sfactor, dfactor, blendmode) \
388 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
389 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
390 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// NOTE(review): presumably the else-branch (regular blending) - the line carrying the else is not visible in this excerpt
395 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
397 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
398 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
399 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
400 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
401 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs map to MUL
402 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
403 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
404 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
405 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
406 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
407 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Inline guard around DPSOFTRAST_Validate: calls it only when at least one of the bits in f is
// set in thread->validate. The expression always evaluates to 0.
// Both arguments are parenthesized so that expressions such as &threads[i] expand correctly.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state groups selected by mask that are currently marked stale in
// thread->validate, clearing each group's bit before recalculating it.
// NOTE(review): two lines following the mask computation are not visible in this excerpt.
414 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested groups that are actually stale
416 mask &= thread->validate;
419 if (mask & DPSOFTRAST_VALIDATE_FB)
421 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
422 DPSOFTRAST_RecalcFB(thread);
424 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
426 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
427 DPSOFTRAST_RecalcDepthFunc(thread);
429 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
431 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
432 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. A slot is returned only when index is in [1, texture_end)
// and the slot has pixel storage allocated; index 0 is never returned.
// NOTE(review): the fall-through return is not visible in this excerpt; callers test the result against NULL.
436 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
438 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
439 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and re-points every bound-texture pointer (global and
// per-thread) at the same slot in the reallocated array.
// NOTE(review): the realloc result is stored without a NULL check, so a failed allocation would
// lose the old array and the rebasing below would start from NULL.
443 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base so bound pointers can be converted to slot offsets
445 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
446 DPSOFTRAST_State_Thread *thread;
450 // expand texture array as needed
451 if (dpsoftrast.texture_max < 1024)
452 dpsoftrast.texture_max = 1024;
// NOTE(review): presumably the else-branch of the check above - the line between them is not visible in this excerpt
454 dpsoftrast.texture_max *= 2;
455 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings: new base + (old pointer - old base)
456 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457 if (dpsoftrast.texbound[i])
458 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// same rebasing for every thread's private bindings
459 for (j = 0; j < dpsoftrast.numthreads; j++)
461 thread = &dpsoftrast.threads[j];
462 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
463 if (thread->texbound[i])
464 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates the request, takes the first unused slot of dpsoftrast.texture
// (growing the array if needed), fills in the mip level table and allocates zeroed pixel storage.
// Each rejected request stores a message in dpsoftrast.errorstring.
// NOTE(review): the return statements, several declarations and the mip level loop header are on
// lines not visible in this excerpt; presumably the slot number is returned on success and 0 on failure - confirm.
468 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces per level
477 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
478 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
479 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is >= 1 when exactly two dimensions are negative, and it can overflow before the size limit below is checked
480 if (width*height*depth < 1)
482 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
485 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
487 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions (the switch header is not visible in this excerpt)
492 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
493 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
494 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
496 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
497 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
499 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the condition guarding this second assignment is not visible in this excerpt
504 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
507 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are rejected regardless of format
514 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
516 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
519 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
521 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check directly above
524 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
529 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero unless x is a power of two
534 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
536 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
539 // find first empty slot in texture array
540 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
541 if (!dpsoftrast.texture[texnum].bytes)
543 dpsoftrast.texture_firstfree = texnum + 1;
544 if (dpsoftrast.texture_max <= texnum)
545 DPSOFTRAST_Texture_Grow();
546 if (dpsoftrast.texture_end <= texnum)
547 dpsoftrast.texture_end = texnum + 1;
548 texture = &dpsoftrast.texture[texnum];
549 memset(texture, 0, sizeof(*texture));
550 texture->flags = flags;
551 texture->width = width;
552 texture->height = height;
553 texture->depth = depth;
554 texture->sides = sides;
// byte size of this mip level: 4 bytes per texel, all sides included
566 s = w * h * d * sides * 4;
// level table entry: [0] byte offset of the level (the running total so far), [1] byte size, [2..4] dimensions
567 texture->mipmap[mipmaps][0] = size;
568 texture->mipmap[mipmaps][1] = s;
569 texture->mipmap[mipmaps][2] = w;
570 texture->mipmap[mipmaps][3] = h;
571 texture->mipmap[mipmaps][4] = d;
// a 1x1x1 level, or a texture without the mipmap flag, ends the level chain
574 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 texture->mipmaps = mipmaps;
581 texture->size = size;
583 // allocate the pixels now
// NOTE(review): no check for a failed allocation is visible; a NULL bytes pointer makes the slot look unused to the lookup functions
584 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the slot bookkeeping.
// Does nothing when index does not name a live texture.
// NOTE(review): some lines between the lookup and the free are not visible in this excerpt.
588 void DPSOFTRAST_Texture_Free(int index)
590 DPSOFTRAST_Texture *texture;
591 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
595 MM_FREE(texture->bytes);
596 texture->bytes = NULL;
// zeroing the slot also leaves bytes NULL, which marks it unused
597 memset(texture, 0, sizeof(*texture));
598 // adjust the free range and used range
599 if (dpsoftrast.texture_firstfree > index)
600 dpsoftrast.texture_firstfree = index;
// trim trailing unused slots so texture_end is one past the last live texture
601 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
602 dpsoftrast.texture_end--;
// Rebuilds mip levels 1..mipmaps-1 of a texture, each from the level before it, by averaging
// neighbouring 4-byte texels with rounding. Does nothing for an unknown index.
// For every output row, i0..i3 point at the source rows (row0/row1 within layer0/layer1) of the
// parent level and o at the destination row.
// NOTE(review): the assignments of layer0/layer1/row0/row1 and the branch that separates the 3D
// and 2D cases are on lines not visible in this excerpt.
604 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
606 int i, x, y, z, w, layer0, layer1, row0, row1;
607 unsigned char *o, *i0, *i1, *i2, *i3;
608 DPSOFTRAST_Texture *texture;
609 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
610 if (texture->mipmaps <= 1)
612 for (i = 1;i < texture->mipmaps;i++)
614 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
618 if (layer1 >= texture->mipmap[i-1][4])
619 layer1 = texture->mipmap[i-1][4]-1;
620 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
624 if (row1 >= texture->mipmap[i-1][3])
625 row1 = texture->mipmap[i-1][3]-1;
// row start = level offset + 4 bytes * ((height * layer + row) * width)
626 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
627 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
628 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
629 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
630 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
631 w = texture->mipmap[i][2];
634 if (texture->mipmap[i-1][2] > 1)
636 // average 3D texture
// eight source texels per output texel: two columns in each of the four source rows
637 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
639 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
640 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
641 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
642 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
647 // average 3D mipmap with parent width == 1
// NOTE(review): only i0 and i1 advance in this loop; presumably harmless because w is 1 when the parent width is 1 - confirm
648 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
650 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
651 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
652 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
653 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
659 if (texture->mipmap[i-1][2] > 1)
661 // average 2D texture (common case)
// four source texels per output texel: two columns in each of the two source rows
662 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
664 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
665 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
666 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
667 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
672 // 2D texture with parent width == 1
673 o[0] = (i0[0] + i1[0] + 1) >> 1;
674 o[1] = (i0[1] + i1[1] + 1) >> 1;
675 o[2] = (i0[2] + i1[2] + 1) >> 1;
676 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from pixels (tightly packed rows)
// into the texture at (blockx, blocky), then regenerates the mip chain.
// Does nothing for an unknown index.
// NOTE(review): the mip parameter is not used by any visible line - offsets and row pitch always
// come from level 0. No bounds check of the rectangle against the texture size is visible either.
// Some lines (declaration of dst, loop bookkeeping) are not visible in this excerpt.
683 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
685 DPSOFTRAST_Texture *texture;
687 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
690 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
691 while (blockheight > 0)
693 memcpy(dst, pixels, blockwidth * 4);
// source rows are packed back to back; destination rows are a full texture width apart
694 pixels += blockwidth * 4;
695 dst += texture->mipmap[0][2] * 4;
698 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with the data at pixels (mipmap[0][1] bytes are copied),
// then regenerates the mip chain. Does nothing for an unknown index.
// NOTE(review): some lines between the lookup and the copy are not visible in this excerpt.
700 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
702 DPSOFTRAST_Texture *texture;
703 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
706 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
707 DPSOFTRAST_Texture_CalculateMipmaps(index);
709 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
711 DPSOFTRAST_Texture *texture;
712 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713 return texture->mipmap[mip][2];
715 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
717 DPSOFTRAST_Texture *texture;
718 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719 return texture->mipmap[mip][3];
721 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
723 DPSOFTRAST_Texture *texture;
724 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's pixel storage,
// or 0 (null) when index does not name a live texture.
// NOTE(review): mip is used as an array index without a visible range check; some lines before
// the return are not visible in this excerpt.
727 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
729 DPSOFTRAST_Texture *texture;
730 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of that level within bytes
733 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Does nothing for an unknown index.
// Filter modes above DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported through dpsoftrast.errorstring
// when the texture was created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
// NOTE(review): presumably the error path returns without changing the filter - the lines
// following the error assignment are not visible in this excerpt.
735 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
737 DPSOFTRAST_Texture *texture;
738 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
739 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
741 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
746 texture->filter = filter;
// Sets the framebuffer dimensions, the depth buffer and up to four color buffers.
// NOTE(review): the statement controlled by the change check below is not visible in this
// excerpt, so what happens when a target actually changes cannot be confirmed from here.
749 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
// true when any dimension or buffer pointer differs from the current setting
751 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
752 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
753 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
755 dpsoftrast.fb_width = width;
756 dpsoftrast.fb_height = height;
757 dpsoftrast.fb_depthpixels = depthpixels;
758 dpsoftrast.fb_colorpixels[0] = colorpixels0;
759 dpsoftrast.fb_colorpixels[1] = colorpixels1;
760 dpsoftrast.fb_colorpixels[2] = colorpixels2;
761 dpsoftrast.fb_colorpixels[3] = colorpixels3;
764 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current end of the command pool: dpsoftrast.drawcommand is set to the pool's free offset.
// NOTE(review): one line preceding the assignment is not visible in this excerpt.
766 static void DPSOFTRAST_Draw_SyncCommands(void)
769 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least `space` bytes available in the command pool.
// Visible steps: compare the used byte count with the requested space, publish pending commands,
// measure for each thread how far its commandoffset is from the pool's free offset (wrapping by
// the pool size), and wait on a thread's condition variable while it has not caught up with
// dpsoftrast.drawcommand.
// NOTE(review): several declarations, loop/branch headers and returns are on lines not visible
// in this excerpt; the comments below describe only what the visible lines do.
772 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
775 DPSOFTRAST_State_Thread *thread;
777 int freecommand = dpsoftrast.commandpool.freecommand;
778 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already?
779 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
781 DPSOFTRAST_Draw_SyncCommands();
787 for (i = 0; i < dpsoftrast.numthreads; i++)
789 thread = &dpsoftrast.threads[i];
// distance from this thread's position to the free offset, wrapped into [0, pool size)
790 commandoffset = freecommand - thread->commandoffset;
791 if (commandoffset < 0)
792 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
793 if (commandoffset > usedcommands)
796 usedcommands = commandoffset;
799 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait for the selected thread under its mutex
801 thread = &dpsoftrast.threads[waitindex];
802 SDL_LockMutex(thread->drawmutex);
803 if (thread->commandoffset != dpsoftrast.drawcommand)
805 thread->waiting = true;
// wake the thread if it reported itself as starving, then wait for its signal
806 if (thread->starving) SDL_CondSignal(thread->drawcond);
807 SDL_CondWait(thread->waitcond, thread->drawmutex);
808 thread->waiting = false;
810 SDL_UnlockMutex(thread->drawmutex);
812 dpsoftrast.commandpool.usedcommands = usedcommands;
814 DPSOFTRAST_Draw_FlushThreads();
// Rounds size up to the next multiple of COMMAND_SIZE.
// The mask arithmetic is only correct when COMMAND_SIZE is a power of two
// (NOTE(review): its definition, ALIGN_SIZE, is not visible in this excerpt). size is evaluated twice.
818 #define DPSOFTRAST_ALIGNCOMMAND(size) \
819 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a DPSOFTRAST_Command_<name> from the command pool with opcode DPSOFTRAST_OPCODE_<name>
// and the struct size rounded up by DPSOFTRAST_ALIGNCOMMAND; yields a typed pointer to it.
820 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
821 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes for a new command in the command pool and fills in its header
// (opcode and commandsize). The pool is used as a ring: when the command does not fit in the
// space left before the end of the buffer, a Reset opcode is written at the current position
// and the skipped bytes are counted as used.
// NOTE(review): the lines that advance/wrap freecommand and the return statement are not
// visible in this excerpt; presumably the command pointer is returned - confirm.
823 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
825 DPSOFTRAST_Command *command;
826 int freecommand = dpsoftrast.commandpool.freecommand;
827 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one extra command header
828 int extra = sizeof(DPSOFTRAST_Command);
// if the command cannot fit before the end of the buffer, the tail space is needed as well
829 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
830 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free bytes: try to reclaim space, then reload the counters it may have changed
831 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
833 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
834 freecommand = dpsoftrast.commandpool.freecommand;
835 usedcommands = dpsoftrast.commandpool.usedcommands;
// mark the unusable tail with a Reset command and account for it
837 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
839 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
840 command->opcode = DPSOFTRAST_OPCODE_Reset;
841 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
844 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
845 command->opcode = opcode;
846 command->commandsize = size;
848 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
850 dpsoftrast.commandpool.freecommand = freecommand;
851 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back `size` bytes from the command pool counters: usedcommands is reduced by size and
// both counters are written back.
// NOTE(review): the lines that adjust freecommand before the wrap below are not visible in this
// excerpt; presumably the addition of the pool size is conditional on freecommand having gone negative - confirm.
855 static void DPSOFTRAST_UndoCommand(int size)
857 int freecommand = dpsoftrast.commandpool.freecommand;
858 int usedcommands = dpsoftrast.commandpool.usedcommands;
861 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
862 usedcommands -= size;
863 dpsoftrast.commandpool.freecommand = freecommand;
864 dpsoftrast.commandpool.usedcommands = usedcommands;
867 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
868 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
870 thread->viewport[0] = command->x;
871 thread->viewport[1] = command->y;
872 thread->viewport[2] = command->width;
873 thread->viewport[3] = command->height;
874 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command for the threads and updates the global copy of the viewport,
// including the global viewport transform.
// NOTE(review): the assignments of command->x and command->y are presumably on the two lines
// not visible in this excerpt.
876 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
878 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
881 command->width = width;
882 command->height = height;
// the global state is updated immediately, independent of the queued command
884 dpsoftrast.viewport[0] = x;
885 dpsoftrast.viewport[1] = y;
886 dpsoftrast.viewport[2] = width;
887 dpsoftrast.viewport[3] = height;
888 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
891 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Applies a queued ClearColor command: fills the scissor rectangle of every attached color
// buffer with the command's color, restricted to the rows delimited by this thread's
// miny1/maxy1 and miny2/maxy2 values.
// NOTE(review): several declarations, early-outs and the innermost store are on lines not
// visible in this excerpt.
892 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
894 int i, x1, y1, x2, y2, w, h, x, y;
895 int miny1 = thread->miny1;
896 int maxy1 = thread->maxy1;
897 int miny2 = thread->miny2;
898 int maxy2 = thread->maxy2;
// make sure fb_scissor is current before reading it
902 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
903 x1 = thread->fb_scissor[0];
904 y1 = thread->fb_scissor[1];
905 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
906 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to this thread's rows
907 if (y1 < miny1) y1 = miny1;
908 if (y2 > maxy2) y2 = maxy2;
913 // FIXME: honor fb_colormask?
914 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
915 for (i = 0;i < 4;i++)
// color buffers that are not attached are tested for here
917 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1; the loop step then moves on to the rows from miny2 up to maxy2
919 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
922 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
923 for (x = x1;x < x2;x++)
// Queues a ClearColor command.
// NOTE(review): the assignments of the r/g/b/a fields are presumably on lines not visible in this excerpt.
928 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
930 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
937 DEFCOMMAND(3, ClearDepth, float depth;)
// Applies a queued ClearDepth command: fills the scissor rectangle of the depth buffer with the
// converted depth value, restricted to the rows delimited by this thread's miny1/maxy1 and
// miny2/maxy2 values.
// NOTE(review): several declarations, early-outs and the innermost store are on lines not
// visible in this excerpt.
938 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
940 int x1, y1, x2, y2, w, h, x, y;
941 int miny1 = thread->miny1;
942 int maxy1 = thread->maxy1;
943 int miny2 = thread->miny2;
944 int maxy2 = thread->maxy2;
// make sure fb_scissor is current before reading it
948 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
949 x1 = thread->fb_scissor[0];
950 y1 = thread->fb_scissor[1];
951 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
952 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to this thread's rows
953 if (y1 < miny1) y1 = miny1;
954 if (y2 > maxy2) y2 = maxy2;
959 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1; the loop step then moves on to the rows from miny2 up to maxy2
960 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
963 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
964 for (x = x1;x < x2;x++)
// Queues a ClearDepth command.
// NOTE(review): the assignment of the depth field is presumably on a line not visible in this excerpt.
968 void DPSOFTRAST_ClearDepth(float d)
970 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
974 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
975 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
977 thread->colormask[0] = command->r != 0;
978 thread->colormask[1] = command->g != 0;
979 thread->colormask[2] = command->b != 0;
980 thread->colormask[3] = command->a != 0;
981 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command.
// NOTE(review): the assignments of the r/g/b/a fields are presumably on lines not visible in this excerpt.
983 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
985 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
992 DEFCOMMAND(5, DepthTest, int enable;)
993 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
995 thread->depthtest = command->enable;
996 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
998 void DPSOFTRAST_DepthTest(int enable)
1000 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1001 command->enable = enable;
1004 DEFCOMMAND(6, ScissorTest, int enable;)
1005 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1007 thread->scissortest = command->enable;
1008 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1010 void DPSOFTRAST_ScissorTest(int enable)
1012 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1013 command->enable = enable;
1016 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1017 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1019 thread->scissor[0] = command->x;
1020 thread->scissor[1] = command->y;
1021 thread->scissor[2] = command->width;
1022 thread->scissor[3] = command->height;
1023 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command.
// NOTE(review): the assignments of command->x and command->y are presumably on the two lines
// not visible in this excerpt.
1025 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1027 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1030 command->width = width;
1031 command->height = height;
1034 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1035 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1037 thread->blendfunc[0] = command->sfactor;
1038 thread->blendfunc[1] = command->dfactor;
1039 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1041 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1043 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1044 command->sfactor = sfactor;
1045 command->dfactor = dfactor;
1048 DEFCOMMAND(9, BlendSubtract, int enable;)
1049 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1051 thread->blendsubtract = command->enable;
1052 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1054 void DPSOFTRAST_BlendSubtract(int enable)
1056 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1057 command->enable = enable;
1060 DEFCOMMAND(10, DepthMask, int enable;)
1061 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1063 thread->depthmask = command->enable;
1065 void DPSOFTRAST_DepthMask(int enable)
1067 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1068 command->enable = enable;
1071 DEFCOMMAND(11, DepthFunc, int func;)
1072 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1074 thread->depthfunc = command->func;
1076 void DPSOFTRAST_DepthFunc(int func)
1078 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1079 command->func = func;
1082 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1083 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1085 thread->depthrange[0] = command->nearval;
1086 thread->depthrange[1] = command->farval;
1088 void DPSOFTRAST_DepthRange(float nearval, float farval)
1090 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1091 command->nearval = nearval;
1092 command->farval = farval;
1095 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1096 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1098 thread->polygonoffset[0] = command->alongnormal;
1099 thread->polygonoffset[1] = command->intoview;
1101 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1103 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1104 command->alongnormal = alongnormal;
1105 command->intoview = intoview;
1108 DEFCOMMAND(14, CullFace, int mode;)
1109 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1111 thread->cullface = command->mode;
1113 void DPSOFTRAST_CullFace(int mode)
1115 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1116 command->mode = mode;
1119 DEFCOMMAND(15, AlphaTest, int enable;)
1120 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1122 thread->alphatest = command->enable;
1124 void DPSOFTRAST_AlphaTest(int enable)
1126 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1127 command->enable = enable;
1130 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1131 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1133 thread->alphafunc = command->func;
1134 thread->alphavalue = command->ref;
1136 void DPSOFTRAST_AlphaFunc(int func, float ref)
1138 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1139 command->func = func;
1143 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1145 dpsoftrast.color[0] = r;
1146 dpsoftrast.color[1] = g;
1147 dpsoftrast.color[2] = b;
1148 dpsoftrast.color[3] = a;
// Reads a rectangular block of the color framebuffer
// (dpsoftrast.fb_colorpixels[0]) into outpixels. The requested block is
// clamped to the framebuffer dimensions, and source rows are addressed as
// (fb_height - 1 - y), i.e. in the opposite vertical order from output rows.
// NOTE(review): the declarations of bx1, by1, x, y, b and o and the per-pixel /
// per-row copy statements are not visible in this listing - confirm against
// the complete file before relying on details.
1151 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// row strides in bytes (4 bytes per pixel on both sides)
1153 int outstride = blockwidth * 4;
1154 int instride = dpsoftrast.fb_width * 4;
// exclusive right/bottom edges of the requested block
1157 int bx2 = blockx + blockwidth;
1158 int by2 = blocky + blockheight;
1163 unsigned char *inpixels;
// clamp the block to the framebuffer
1167 if (bx1 < 0) bx1 = 0;
1168 if (by1 < 0) by1 = 0;
1169 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1170 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1173 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// two copy paths selected by dpsoftrast.bigendian; this one walks pixels
// individually (presumably to reorder bytes - loop body not shown, TODO confirm)
1174 if (dpsoftrast.bigendian)
1176 for (y = by1;y < by2;y++)
// b: start of the source row (vertically mirrored), o: start of the output row
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
1180 for (x = bx1;x < bx2;x++)
// second path: same row addressing, without a per-pixel loop in the visible lines
1193 for (y = by1;y < by2;y++)
1195 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the color framebuffer into one mip level of a texture.
// The destination rectangle starting at (tx,ty) is clamped to the mip level's
// size and the source rectangle starting at (sx,sy) to the framebuffer size;
// rows are then copied with memcpy, 4 bytes per pixel. If the texture has more
// than one mip level, DPSOFTRAST_Texture_CalculateMipmaps is called afterwards.
// Returns early when the texture index or mip level is invalid.
// NOTE(review): the declarations of tx1/ty1/sx1/sy1, tw/th/sw/sh, y and the
// width/height locals, and the statements that compute tw/th/sw/sh, are not
// visible in this listing - confirm against the complete file.
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
// exclusive end coordinates of the destination (t*) and source (s*) rectangles
1206 int tx2 = tx + width;
1207 int ty2 = ty + height;
1210 int sx2 = sx + width;
1211 int sy2 = sy + height;
1221 unsigned int *spixels;
1222 unsigned int *tpixels;
1223 DPSOFTRAST_Texture *texture;
1224 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the color framebuffer
1228 spixels = dpsoftrast.fb_colorpixels[0];
1229 swidth = dpsoftrast.fb_width;
1230 sheight = dpsoftrast.fb_height;
// destination is the selected mip level: mipmap[mip][0] is used as a byte
// offset into texture->bytes, [2] and [3] as its width and height
1231 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1232 twidth = texture->mipmap[mip][2];
1233 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1234 if (tx1 < 0) tx1 = 0;
1235 if (ty1 < 0) ty1 = 0;
1236 if (tx2 > twidth) tx2 = twidth;
1237 if (ty2 > theight) ty2 = theight;
1238 if (sx1 < 0) sx1 = 0;
1239 if (sy1 < 0) sy1 = 0;
1240 if (sx2 > swidth) sx2 = swidth;
1241 if (sy2 > sheight) sy2 = sheight;
// the copy cannot be larger than the clamped source extent
1246 if (tw > sw) tw = sw;
1247 if (th > sh) th = sh;
// NOTE(review): presumably an early-out for an empty rectangle; the statement
// this condition guards is not visible in this listing - confirm
1248 if (tw < 1 || th < 1)
// copy tw pixels (tw*4 bytes) per row
1250 for (y = 0;y < th;y++)
1251 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// refresh the smaller mip levels when the texture has any
1252 if (texture->mipmaps > 1)
1253 DPSOFTRAST_Texture_CalculateMipmaps(index);
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1259 if (thread->texbound[command->unitnum])
1260 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261 thread->texbound[command->unitnum] = command->texture;
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1265 DPSOFTRAST_Command_SetTexture *command;
1266 DPSOFTRAST_Texture *texture;
1267 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1269 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1272 texture = DPSOFTRAST_Texture_GetByIndex(index);
1273 if (index && !texture)
1275 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1279 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280 command->unitnum = unitnum;
1281 command->texture = texture;
1283 dpsoftrast.texbound[unitnum] = texture;
1284 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1289 dpsoftrast.pointer_vertex3f = vertex3f;
1290 dpsoftrast.stride_vertex = stride;
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1294 dpsoftrast.pointer_color4f = color4f;
1295 dpsoftrast.pointer_color4ub = NULL;
1296 dpsoftrast.stride_color = stride;
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1300 dpsoftrast.pointer_color4f = NULL;
1301 dpsoftrast.pointer_color4ub = color4ub;
1302 dpsoftrast.stride_color = stride;
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1306 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308 dpsoftrast.stride_texcoord[unitnum] = stride;
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1314 thread->shader_mode = command->mode;
1315 thread->shader_permutation = command->permutation;
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1319 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320 command->mode = mode;
1321 command->permutation = permutation;
1323 dpsoftrast.shader_mode = mode;
1324 dpsoftrast.shader_permutation = permutation;
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1330 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1334 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335 command->index = index;
1336 command->val[0] = v0;
1337 command->val[1] = v1;
1338 command->val[2] = v2;
1339 command->val[3] = v3;
1341 dpsoftrast.uniform4f[index*4+0] = v0;
1342 dpsoftrast.uniform4f[index*4+1] = v1;
1343 dpsoftrast.uniform4f[index*4+2] = v2;
1344 dpsoftrast.uniform4f[index*4+3] = v3;
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1348 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349 command->index = index;
1350 memcpy(command->val, v, sizeof(command->val));
1352 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1358 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads arraysize 4x4 float matrices starting at uniform slot `uniform`.
// Each matrix occupies four consecutive uniform slots (index advances by 4,
// v by 16 floats per iteration). For every matrix a UniformMatrix4f command is
// allocated, and the same four vectors are written both into the command and
// into dpsoftrast.uniform4f.
// NOTE(review): the declarations of i and index, the else branch between the
// unaligned and aligned loads, and any test of the `transpose` parameter are
// not visible in this listing; as shown, `transpose` is never referenced -
// confirm against the complete file.
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1364 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1366 __m128 m0, m1, m2, m3;
1367 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368 command->index = index;
// v not aligned to ALIGN_SIZE: use unaligned loads
1369 if (((size_t)v)&(ALIGN_SIZE-1))
1371 m0 = _mm_loadu_ps(v);
1372 m1 = _mm_loadu_ps(v+4);
1373 m2 = _mm_loadu_ps(v+8);
1374 m3 = _mm_loadu_ps(v+12);
// aligned-load variant of the same four loads
1378 m0 = _mm_load_ps(v);
1379 m1 = _mm_load_ps(v+4);
1380 m2 = _mm_load_ps(v+8);
1381 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 via unpack + movelh/movehl
1385 __m128 t0, t1, t2, t3;
1386 t0 = _mm_unpacklo_ps(m0, m1);
1387 t1 = _mm_unpacklo_ps(m2, m3);
1388 t2 = _mm_unpackhi_ps(m0, m1);
1389 t3 = _mm_unpackhi_ps(m2, m3);
1390 m0 = _mm_movelh_ps(t0, t1);
1391 m1 = _mm_movehl_ps(t1, t0);
1392 m2 = _mm_movelh_ps(t2, t3);
1393 m3 = _mm_movehl_ps(t3, t2);
// store into the command (aligned stores; val is declared with ALIGN)
1395 _mm_store_ps(command->val, m0);
1396 _mm_store_ps(command->val+4, m1);
1397 _mm_store_ps(command->val+8, m2);
1398 _mm_store_ps(command->val+12, m3);
// mirror into the global uniform array (aligned stores)
1399 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1410 thread->uniform1i[command->index] = command->val;
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1414 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415 command->index = index;
1418 dpsoftrast.uniform1i[command->index] = i0;
// Copies four-float elements from a byte-addressed source into dst using SSE.
// dst is always written with aligned stores; the source is read with unaligned
// loads when either the source address or the stride is not a multiple of
// ALIGN_SIZE, and with aligned loads otherwise.
// NOTE(review): the loop statements and the dst/src increments are not visible
// in this listing; presumably `size` elements are copied with src advancing by
// `stride` bytes - confirm against the complete file.
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
// one past the last output float (4 floats per element)
1424 float *end = dst + size*4;
1425 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source path
1429 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// aligned source path
1438 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands three-float elements into four-float elements (x, y, z, 1.0f) using
// SSE; dst is always written with aligned stores.
// When the source is tightly packed (stride == sizeof(float[3])) four elements
// are converted per step from three 16-byte loads (12 floats -> 16 floats).
// The single-element code further down loads 16 bytes and replaces the fourth
// lane with 1.0f.
// NOTE(review): the loop statements, the dst increments, the else branches and
// the src increment of the single-element path are not visible in this
// listing - confirm against the complete file.
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
// one past the last output float (4 floats per element)
1447 float *end = dst + size*4;
1448 if (stride == sizeof(float[3]))
// end of the part that can be handled four elements at a time
1450 float *end4 = dst + (size&~3)*4;
1451 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source: v1 = f0..f3, v2 = f4..f7, v3 = f8..f11
1455 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// element 0: rotate (f0,f1,f2,f3) -> (f3,f0,f1,f2), overwrite lane 0 with 1.0f,
// rotate back -> (f0,f1,f2,1)
1456 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 1: gather (f3,f3,f4,f5), overwrite lane 0, rotate -> (f3,f4,f5,1)
1459 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 2: gather (f6,f7,f8,f8), rotate, overwrite lane 0, rotate -> (f6,f7,f8,1)
1462 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 3: (f8,f9,f10,f11) with lane 0 overwritten, rotate -> (f9,f10,f11,1)
1466 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// advance past the four packed source elements
1469 src += 4*sizeof(float[3]);
// aligned source: identical shuffles with aligned loads
1476 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1490 src += 4*sizeof(float[3]);
// single-element code: unaligned loads if the address or the stride is not a
// multiple of ALIGN_SIZE
1494 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
// note: each load reads 16 bytes, i.e. one float beyond the 3-float element
1498 __m128 v = _mm_loadu_ps((const float *)src);
1499 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502 _mm_store_ps(dst, v);
// aligned variant of the single-element conversion
1511 __m128 v = _mm_load_ps((const float *)src);
1512 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515 _mm_store_ps(dst, v);
// Expands two-float elements into four-float elements (x, y, 0.0f, 1.0f) using
// SSE; dst is always written with aligned stores.
// When the source is tightly packed (stride == sizeof(float[2])) two elements
// are produced from each 16-byte load; the final visible statement converts a
// single element from an 8-byte load.
// NOTE(review): the loop statements, the dst increments, the else branches and
// the src increment of the single-element path are not visible in this
// listing - confirm against the complete file.
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
// one past the last output float (4 floats per element)
1524 float *end = dst + size*4;
// constant supplying the upper two lanes (0, 1) of every output element
1525 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526 if (stride == sizeof(float[2]))
// end of the part that can be handled two elements at a time
1528 float *end2 = dst + (size&~1)*4;
1529 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source: v = (a0, a1, b0, b1)
1533 __m128 v = _mm_loadu_ps((const float *)src);
// first element: low half of v + high half of v2 -> (a0, a1, 0, 1)
1534 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// second element: high half of v moved low, high half of v2 kept -> (b0, b1, 0, 1)
1535 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1537 src += 2*sizeof(float[2]);
// aligned source: same conversion with an aligned load
1544 __m128 v = _mm_load_ps((const float *)src);
1545 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1548 src += 2*sizeof(float[2]);
// single element: load 8 bytes into the low half of v2 -> (s0, s1, 0, 1)
1554 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts four-byte elements into four-float elements using SSE2, scaling
// each byte by 1/255; dst is always written with aligned stores.
// When the source is tightly packed (stride == 4 bytes) four elements are
// converted per 16-byte load; the final visible statements convert a single
// element from a 4-byte read.
// NOTE(review): the loop statements, the dst increments, the else branches and
// the src increment of the single-element path are not visible in this
// listing - confirm against the complete file.
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
// one past the last output float (4 floats per element)
1562 float *end = dst + size*4;
1563 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564 if (stride == sizeof(unsigned char[4]))
// end of the part that can be handled four elements at a time
1566 float *end4 = dst + (size&~3)*4;
1567 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source: widen 16 bytes to two vectors of eight 16-bit values
1571 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
// widen each half to 32-bit, convert to float, scale, store one element each
1572 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1577 src += 4*sizeof(unsigned char[4]);
// aligned source: same conversion with an aligned load
1584 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1590 src += 4*sizeof(unsigned char[4]);
// single element: read 4 bytes as one int, widen 8 -> 16 -> 32 bit, convert, scale
1596 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills `size` four-float elements of dst with the single four-float value
// read from src.
// dst must be 16-byte aligned (aligned stores are used); src may be unaligned.
// A size of zero or less writes nothing.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	float *end = dst + 4*size;
	__m128 v = _mm_loadu_ps(src);
	// store the same vector into every element, not just the first one
	while (dst < end)
	{
		_mm_store_ps(dst, v);
		dst += 4;
	}
}
// Multiplies numitems four-float vectors from in4f by the 4x4 matrix in
// inmatrix16f. Each result is the sum of the four matrix vectors
// (inmatrix16f + 0/4/8/12) weighted by the input's x, y, z and w lanes.
// An identity matrix is detected with memcmp and handled as a plain copy
// (skipped entirely when out4f and in4f are the same pointer).
// NOTE(review): the loop statements, the call that consumes each computed sum
// (presumably a store to out4f), the pointer increments and the return after
// the identity fast path are not visible in this listing - confirm against the
// complete file.
1615 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1618 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1619 __m128 m0, m1, m2, m3;
1621 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1623 // fast case for identity matrix
1624 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
// one past the last output float
1627 end = out4f + numitems*4;
// the matrix is always read with unaligned loads
1628 m0 = _mm_loadu_ps(inmatrix16f);
1629 m1 = _mm_loadu_ps(inmatrix16f + 4);
1630 m2 = _mm_loadu_ps(inmatrix16f + 8);
1631 m3 = _mm_loadu_ps(inmatrix16f + 12);
1632 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path: v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3
1636 __m128 v = _mm_loadu_ps(in4f);
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input path: same arithmetic with an aligned load
1650 __m128 v = _mm_load_ps(in4f);
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1654 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1655 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems four-float vertices from in4f to out4f.
// The buffers may be identical or overlap; a count of zero or less copies
// nothing.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// a negative count would convert to an enormous size_t
	if (numitems <= 0)
		return;
	// memmove rather than memcpy: in-place use (out4f == in4f) is a valid way
	// to call the sibling vertex routines, and memcpy is undefined for that
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale):
// perspective-divides a clip-space vector and maps it through the viewport.
// w is broadcast from lane 3; the vector is rotated to (w, x, y, z) and lane 0
// replaced by 1.0f, then out = viewportcenter + viewportscale * p / w, rotated
// back so the former lane 0 ends up in lane 3. viewportcenter/viewportscale
// are therefore expected in the rotated lane order.
// NOTE(review): the block braces of the macro body are not visible in this
// listing.
1669 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1671 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale):
// the visible body is statement-for-statement the same as
// DPSOFTRAST_PROJECTVERTEX: broadcast w, rotate to (w, x, y, z) with lane 0 set
// to 1.0f, compute viewportcenter + viewportscale * p / w, rotate back.
// NOTE(review): the block braces of the macro body are not visible in this
// listing; no use of this macro appears in the visible part of the file.
1677 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1679 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1680 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1681 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1682 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3):
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, where each lane of p is broadcast
// with a shuffle before the multiply.
// NOTE(review): the line that declares p (presumably initialised from `in`)
// and the block braces are not visible in this listing - confirm.
1685 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1688 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1691 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space Y range (*starty, *endy) for the axis-aligned box
// spanned by minpos/maxpos after transformation by m0..m3.
// The eight box corners are transformed; corners with z + w >= 0 contribute
// their projected value directly, and for the remaining corners the edges to
// neighbouring in-front corners are intersected with the z + w = 0 plane and
// the intersection points are projected instead. The projected extremes are
// clamped to [-2, 2] and mapped through the viewport.
// clipmask starts with all eight bits set and a bit is cleared for every
// corner found in front of the plane.
// NOTE(review): the return statement (presumably returning clipmask), the
// `#define BBCLIP(k)` line, the BBFRONT(0, ...) and BBFRONT(7, ...) calls and
// the declaration of `proj` inside BBFRONT are not visible in this listing -
// confirm against the complete file.
1694 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1696 int clipmask = 0xFF;
// minproj/maxproj start outside the clamp range used at the end ([-2, 2])
1697 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of each matrix vector so that the transformed Y ends up
// in lane 0, where the scalar (_ss) operations below work
1698 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1699 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1700 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1701 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if it is
// non-negative clear bit k of clipmask and fold lane0 / w into minproj/maxproj
1702 #define BBFRONT(k, pos) \
1704 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1705 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1706 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1709 clipmask &= ~(1<<k); \
1710 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711 minproj = _mm_min_ss(minproj, proj); \
1712 maxproj = _mm_max_ss(maxproj, proj); \
1716 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1717 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1718 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1719 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1720 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1721 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// clipping step (body of BBCLIP(k)): for a corner still flagged in clipmask,
// each neighbour k^1, k^2, k^4 that is in front yields an edge; frac is the
// parametric position where clipdist reaches zero along that edge, and the
// interpolated point is projected and folded into minproj/maxproj
1725 if (clipmask&(1<<k)) \
1727 if (!(clipmask&(1<<(k^1)))) \
1729 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1730 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1731 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732 minproj = _mm_min_ss(minproj, proj); \
1733 maxproj = _mm_max_ss(maxproj, proj); \
1735 if (!(clipmask&(1<<(k^2)))) \
1737 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1738 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1739 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740 minproj = _mm_min_ss(minproj, proj); \
1741 maxproj = _mm_max_ss(maxproj, proj); \
1743 if (!(clipmask&(1<<(k^4)))) \
1745 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1746 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1747 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748 minproj = _mm_min_ss(minproj, proj); \
1749 maxproj = _mm_max_ss(maxproj, proj); \
1753 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move element 2 of the viewport vectors into lane 0 (presumably the Y term,
// given the rotated layout DPSOFTRAST_PROJECTVERTEX uses - TODO confirm)
1754 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1755 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before scaling
1756 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1757 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1758 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1759 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// NOTE(review): starty is taken from maxproj and endy from minproj (+1), which
// suggests the Y viewport scale is negative - confirm
1760 *starty = _mm_cvttss_si32(maxproj);
1761 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems clip-space vertices to screen space without applying a
// matrix: each input vector is copied to out4f, projected with
// DPSOFTRAST_PROJECTVERTEX and stored to screen4f, while a component-wise
// min/max of the inputs is accumulated. The bounds are then passed to
// DPSOFTRAST_Vertex_BoundY with an identity matrix to fill *starty / *endy,
// and its result is returned.
// NOTE(review): the loop statements, pointer increments and the else branch
// between the unaligned and aligned paths are not visible in this listing.
// The initial load of in4f happens before any visible count check, so
// numitems is presumably expected to be at least 1 - confirm.
1766 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
// one past the last output float
1769 float *end = out4f + numitems*4;
1770 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1771 __m128 minpos, maxpos;
1772 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; bounds are seeded with the first vertex
1774 minpos = maxpos = _mm_loadu_ps(in4f);
1777 __m128 v = _mm_loadu_ps(in4f);
1778 minpos = _mm_min_ps(minpos, v);
1779 maxpos = _mm_max_ps(maxpos, v);
1780 _mm_store_ps(out4f, v);
1781 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1782 _mm_store_ps(screen4f, v);
// aligned input path; same work with aligned loads
1790 minpos = maxpos = _mm_load_ps(in4f);
1793 __m128 v = _mm_load_ps(in4f);
1794 minpos = _mm_min_ps(minpos, v);
1795 maxpos = _mm_max_ps(maxpos, v);
1796 _mm_store_ps(out4f, v);
1797 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1798 _mm_store_ps(screen4f, v);
// identity matrix vectors: the inputs are already in clip space
1805 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1806 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1807 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1808 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1809 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems vertices by inmatrix16f and projects them to screen
// space: each input vector is multiplied by the matrix and stored to out4f,
// then projected with DPSOFTRAST_PROJECTVERTEX and stored to screen4f. A
// component-wise min/max of the untransformed inputs is accumulated and
// handed, together with the matrix, to DPSOFTRAST_Vertex_BoundY, whose result
// is returned and which fills *starty / *endy.
// An identity matrix is delegated to DPSOFTRAST_Vertex_Project.
// NOTE(review): the declaration of `end`, the loop statements, pointer
// increments and the else branch between the unaligned and aligned paths are
// not visible in this listing. The initial load of in4f happens before any
// visible count check, so numitems is presumably expected to be at least 1 -
// confirm.
1814 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1817 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1818 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// identity matrix: skip the multiply entirely
1820 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1821 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1822 end = out4f + numitems*4;
1823 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1824 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is always read with unaligned loads
1825 m0 = _mm_loadu_ps(inmatrix16f);
1826 m1 = _mm_loadu_ps(inmatrix16f + 4);
1827 m2 = _mm_loadu_ps(inmatrix16f + 8);
1828 m3 = _mm_loadu_ps(inmatrix16f + 12);
1829 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; bounds are seeded with the first (untransformed) vertex
1831 minpos = maxpos = _mm_loadu_ps(in4f);
1834 __m128 v = _mm_loadu_ps(in4f);
1835 minpos = _mm_min_ps(minpos, v);
1836 maxpos = _mm_max_ps(maxpos, v);
1837 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1838 _mm_store_ps(out4f, v);
1839 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1840 _mm_store_ps(screen4f, v);
// aligned input path; same work with aligned loads
1848 minpos = maxpos = _mm_load_ps(in4f);
1851 __m128 v = _mm_load_ps(in4f);
1852 minpos = _mm_min_ps(minpos, v);
1853 maxpos = _mm_max_ps(maxpos, v);
1854 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1855 _mm_store_ps(out4f, v);
1856 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1857 _mm_store_ps(screen4f, v);
1864 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Loads one client vertex array into dpsoftrast.post_array4f[outarray] as
// four-float elements, starting at dpsoftrast.firstvertex and covering
// dpsoftrast.numvertices elements.
// Positions are expanded from three floats. Colors come from the float array,
// else from the unsigned-byte array, else are filled with the constant
// dpsoftrast.color. Texcoords are expanded according to the unit's component
// count.
// NOTE(review): the switch statements, case labels for the texcoord arrays and
// component counts, break statements, the declaration of `stride`, the
// handling of a missing texcoord pointer and the return statement (presumably
// returning outf) are not visible in this listing - confirm against the
// complete file.
1869 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1871 float *outf = dpsoftrast.post_array4f[outarray];
1872 const unsigned char *inb;
1873 int firstvertex = dpsoftrast.firstvertex;
1874 int numvertices = dpsoftrast.numvertices;
1878 case DPSOFTRAST_ARRAY_POSITION:
1879 stride = dpsoftrast.stride_vertex;
// byte address of the first requested vertex
1880 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1881 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1883 case DPSOFTRAST_ARRAY_COLOR:
1884 stride = dpsoftrast.stride_color;
// float colors take precedence over byte colors
1885 if (dpsoftrast.pointer_color4f)
1887 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1888 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1890 else if (dpsoftrast.pointer_color4ub)
// (stride was already set to stride_color above; this repeats the assignment)
1892 stride = dpsoftrast.stride_color;
1893 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1894 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array set: replicate the constant color
1898 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoord arrays are indexed relative to DPSOFTRAST_ARRAY_TEXCOORD0
1902 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1903 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1905 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
// pick the loader matching the number of components per texcoord
1906 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1909 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1912 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1915 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1926 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1932 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1934 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1935 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
1940 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1942 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1943 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes the perspective term 1/w along a span. w is evaluated from the
// triangle's plane coefficients (triangle->w[0] per x, w[1] per y, w[2]
// constant) at the span origin; 1/w is computed exactly only at sub-span
// boundaries (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels, or the last pixel) and
// stepped linearly by dz in between.
// NOTE(review): the declarations of x, z and dz and the body of the inner loop
// (presumably writing z into zf[x]) are not visible in this listing - confirm
// against the complete file.
1947 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1950 int startx = span->startx;
1951 int endx = span->endx;
1952 float wslope = triangle->w[0];
// w at relative x = 0 of this span
1953 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact 1/w at the first pixel
1954 float endz = 1.0f / (w + wslope * startx);
// x is advanced by the inner loop, one sub-span per outer iteration
1955 for (x = startx;x < endx;)
1957 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// final sub-span: stop at the last pixel of the span
1959 if (nextsub >= endx) nextsub = endsub = endx-1;
// exact 1/w at the sub-span end, then a linear step toward it
1960 endz = 1.0f / (w + wslope * nextsub);
1961 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1962 for (; x <= endsub; x++, z += dz)
1967 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1970 int startx = span->startx;
1971 int endx = span->endx;
1974 unsigned char * RESTRICT pixelmask = span->pixelmask;
1975 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1978 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1979 // handle alphatest now (this affects depth writes too)
1980 if (thread->alphatest)
1981 for (x = startx;x < endx;x++)
1982 if (in4f[x*4+3] < 0.5f)
1983 pixelmask[x] = false;
1984 // FIXME: this does not handle bigendian
1985 switch(thread->fb_blendmode)
1987 case DPSOFTRAST_BLENDMODE_OPAQUE:
1988 for (x = startx;x < endx;x++)
1992 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1993 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1994 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1995 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1996 pixel[x*4+0] = d[0];
1997 pixel[x*4+1] = d[1];
1998 pixel[x*4+2] = d[2];
1999 pixel[x*4+3] = d[3];
2002 case DPSOFTRAST_BLENDMODE_ALPHA:
2003 for (x = startx;x < endx;x++)
2007 a = in4f[x*4+3] * 255.0f;
2008 b = 1.0f - in4f[x*4+3];
2009 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013 pixel[x*4+0] = d[0];
2014 pixel[x*4+1] = d[1];
2015 pixel[x*4+2] = d[2];
2016 pixel[x*4+3] = d[3];
2019 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2020 for (x = startx;x < endx;x++)
2024 a = in4f[x*4+3] * 255.0f;
2025 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029 pixel[x*4+0] = d[0];
2030 pixel[x*4+1] = d[1];
2031 pixel[x*4+2] = d[2];
2032 pixel[x*4+3] = d[3];
2035 case DPSOFTRAST_BLENDMODE_ADD:
2036 for (x = startx;x < endx;x++)
2040 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044 pixel[x*4+0] = d[0];
2045 pixel[x*4+1] = d[1];
2046 pixel[x*4+2] = d[2];
2047 pixel[x*4+3] = d[3];
2050 case DPSOFTRAST_BLENDMODE_INVMOD:
2051 for (x = startx;x < endx;x++)
2055 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059 pixel[x*4+0] = d[0];
2060 pixel[x*4+1] = d[1];
2061 pixel[x*4+2] = d[2];
2062 pixel[x*4+3] = d[3];
2065 case DPSOFTRAST_BLENDMODE_MUL:
2066 for (x = startx;x < endx;x++)
2070 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2071 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2072 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2073 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2074 pixel[x*4+0] = d[0];
2075 pixel[x*4+1] = d[1];
2076 pixel[x*4+2] = d[2];
2077 pixel[x*4+3] = d[3];
2080 case DPSOFTRAST_BLENDMODE_MUL2:
2081 for (x = startx;x < endx;x++)
2085 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2086 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2087 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2088 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2089 pixel[x*4+0] = d[0];
2090 pixel[x*4+1] = d[1];
2091 pixel[x*4+2] = d[2];
2092 pixel[x*4+3] = d[3];
2095 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2096 for (x = startx;x < endx;x++)
2100 a = in4f[x*4+3] * -255.0f;
2101 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2102 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2103 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2104 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2105 pixel[x*4+0] = d[0];
2106 pixel[x*4+1] = d[1];
2107 pixel[x*4+2] = d[2];
2108 pixel[x*4+3] = d[3];
2111 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2112 for (x = startx;x < endx;x++)
2117 b = 1.0f - in4f[x*4+3];
2118 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2119 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2120 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2121 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2122 pixel[x*4+0] = d[0];
2123 pixel[x*4+1] = d[1];
2124 pixel[x*4+2] = d[2];
2125 pixel[x*4+3] = d[3];
2128 case DPSOFTRAST_BLENDMODE_INVADD:
2129 for (x = startx;x < endx;x++)
2133 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2134 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2135 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2136 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2137 pixel[x*4+0] = d[0];
2138 pixel[x*4+1] = d[1];
2139 pixel[x*4+2] = d[2];
2140 pixel[x*4+3] = d[3];
// Write one span of shaded 8-bit-per-channel pixels (in4ub) into color
// buffer 0 (dpsoftrast.fb_colorpixels[0]) using thread->fb_blendmode.
//   thread   - supplies the alphatest flag and the blend mode
//   triangle - not referenced in the lines visible here
//   span     - supplies startx/endx, the framebuffer x/y origin and pixelmask
//   in4ub    - source pixels, 4 bytes per pixel, indexed by span-relative x
// Pixels whose pixelmask byte is zero are left untouched.
2146 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2150 int startx = span->startx;
2151 int endx = span->endx;
2152 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2153 unsigned char * RESTRICT pixelmask = span->pixelmask;
2154 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2155 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both views of the color buffer to the first pixel of this span's row
2158 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2159 pixeli += span->y * dpsoftrast.fb_width + span->x;
2160 // handle alphatest now (this affects depth writes too)
// in4ub holds 8-bit alpha (0..255), so the half-opacity cutoff is 128;
// comparing the byte against 0.5f would only ever reject alpha == 0.
2161 if (thread->alphatest)
2162 for (x = startx;x < endx;x++)
2163 if (in4ub[x*4+3] < 128)
2164 pixelmask[x] = false;
2165 // FIXME: this does not handle bigendian
2166 switch(thread->fb_blendmode)
2168 case DPSOFTRAST_BLENDMODE_OPAQUE:
// fast path: when four consecutive mask bytes are all 1, copy four pixels with one unaligned 128-bit store
2169 for (x = startx;x + 4 <= endx;)
2171 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2173 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2187 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND walks the span two pixels at a time, switching on the two
// pixelmask bytes read as one unsigned short, then finishes any leftover
// pixel in a scalar tail loop.  src/dst are widened to 16 bits per channel
// before blending and packed back with unsigned saturation.
// NOTE(review): the expansion sites of blend2/blend1 are elided from this
// excerpt; presumably blend2 handles the two-pixel case and blend1 the
// single-pixel cases -- confirm against the full macro.
2188 #define FINISHBLEND(blend2, blend1) \
2189 for (x = startx;x + 2 <= endx;x += 2) \
2192 switch (*(const unsigned short*)&pixelmask[x]) \
2195 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2196 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2198 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2201 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2202 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2204 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2207 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2208 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2210 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2215 for(;x < endx; x++) \
2218 if (!pixelmask[x]) \
2220 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2223 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2227 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2234 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2243 case DPSOFTRAST_BLENDMODE_ADD:
2244 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2246 case DPSOFTRAST_BLENDMODE_INVMOD:
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2250 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2253 case DPSOFTRAST_BLENDMODE_MUL:
2254 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2256 case DPSOFTRAST_BLENDMODE_MUL2:
2257 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2259 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2261 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2268 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2270 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2277 case DPSOFTRAST_BLENDMODE_INVADD:
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2281 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Sample the 2D texture bound to unit `texunitindex` for every x in
// [span->startx, span->endx) and write the result as four floats per pixel
// into out4f.
//   thread       - supplies texbound[] (the bound textures)
//   triangle     - supplies mip[texunitindex] and the attribute data/slope
//   span         - supplies startx/endx
//   out4f        - destination, 4 floats per pixel, indexed by span-relative x
//   texunitindex - texture unit to sample
//   arrayindex   - vertex attribute used as the texture coordinate
//   zf           - per-pixel multiplier applied to the interpolated coordinate
// Texel bytes [2],[1],[0],[3] are written to out4f channels 0,1,2,3, scaled
// to 0..1.  Coordinates are evaluated exactly at sub-span ends and stepped
// linearly in 16.16 fixed point in between.
2287 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2290 int startx = span->startx;
2291 int endx = span->endx;
2296 float tc[2], endtc[2];
2298 unsigned int tci[2];
2299 unsigned int tci1[2];
2300 unsigned int tcimin[2];
2301 unsigned int tcimax[2];
2306 const unsigned char * RESTRICT pixelbase;
2307 const unsigned char * RESTRICT pixel[4];
2308 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2309 // if no texture is bound, just fill it with white
2312 for (x = startx;x < endx;x++)
2314 out4f[x*4+0] = 1.0f;
2315 out4f[x*4+1] = 1.0f;
2316 out4f[x*4+2] = 1.0f;
2317 out4f[x*4+3] = 1.0f;
2321 mip = triangle->mip[texunitindex];
// pixelbase points at the first texel of the selected mip level
2322 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2323 // if this mipmap of the texture is 1 pixel, just fill it with that color
2324 if (texture->mipmap[mip][1] == 4)
// read the texel of the selected mip level (pixelbase); texture->bytes is
// the start of the whole texture, i.e. the first texel of level 0
2326 c[0] = pixelbase[2] * (1.0f/255.0f);
2327 c[1] = pixelbase[1] * (1.0f/255.0f);
2328 c[2] = pixelbase[0] * (1.0f/255.0f);
2329 c[3] = pixelbase[3] * (1.0f/255.0f);
2330 for (x = startx;x < endx;x++)
2332 out4f[x*4+0] = c[0];
2333 out4f[x*4+1] = c[1];
2334 out4f[x*4+2] = c[2];
2335 out4f[x*4+3] = c[3];
2339 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2340 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2341 flags = texture->flags;
2342 tcscale[0] = texture->mipmap[mip][2];
2343 tcscale[1] = texture->mipmap[mip][3];
2344 tciwidth = texture->mipmap[mip][2];
2347 tcimax[0] = texture->mipmap[mip][2]-1;
2348 tcimax[1] = texture->mipmap[mip][3]-1;
2349 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2350 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel; the -0.5 shifts to texel centers
2351 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2352 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2353 for (x = startx;x < endx;)
2355 unsigned int subtc[2];
2356 unsigned int substep[2];
2357 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2358 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last (possibly short) sub-span: end exactly on the final pixel and rescale the step
2359 if (nextsub >= endx)
2361 nextsub = endsub = endx-1;
2362 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2366 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2367 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): these float -> unsigned conversions receive negative values
// when the coordinate or step is negative; confirm the target handles that
// as two's-complement wrap, since ISO C leaves it undefined.
2368 substep[0] = (endtc[0] - tc[0]) * subscale;
2369 substep[1] = (endtc[1] - tc[1]) * subscale;
2370 subtc[0] = tc[0] * (1<<16);
2371 subtc[1] = tc[1] * (1<<16);
2374 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2376 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// bilinear weights: the top 12 bits of the 16-bit fraction (bits 4..15 of
// the 16.16 coordinate); the four products sum to 0x1000*0x1000, which the
// 1/0xFF000000 factor below folds together with the 1/255 byte scale
2378 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2379 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2380 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2381 tci[0] = subtc[0]>>16;
2382 tci[1] = subtc[1]>>16;
2383 tci1[0] = tci[0] + 1;
2384 tci1[1] = tci[1] + 1;
2385 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2386 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2387 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2388 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2389 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2390 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2391 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2392 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2393 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2394 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2395 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2396 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2397 out4f[x*4+0] = c[0];
2398 out4f[x*4+1] = c[1];
2399 out4f[x*4+2] = c[2];
2400 out4f[x*4+3] = c[3];
2405 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// same bilinear weights as the clamped loop: top 12 bits of the 16-bit fraction
2407 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2408 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2409 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2410 tci[0] = subtc[0]>>16;
2411 tci[1] = subtc[1]>>16;
2412 tci1[0] = tci[0] + 1;
2413 tci1[1] = tci[1] + 1;
// wrap by masking with size-1
// NOTE(review): this is a true wrap only for power-of-two sizes -- confirm
2414 tci[0] &= tciwrapmask[0];
2415 tci[1] &= tciwrapmask[1];
2416 tci1[0] &= tciwrapmask[0];
2417 tci1[1] &= tciwrapmask[1];
2418 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2419 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2420 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2421 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2422 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2423 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2424 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2425 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2426 out4f[x*4+0] = c[0];
2427 out4f[x*4+1] = c[1];
2428 out4f[x*4+2] = c[2];
2429 out4f[x*4+3] = c[3];
2433 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// nearest-texel sampling, coordinates clamped to [tcimin, tcimax]
2435 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2437 tci[0] = subtc[0]>>16;
2438 tci[1] = subtc[1]>>16;
2439 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2440 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2441 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2442 c[0] = pixel[0][2] * (1.0f / 255.0f);
2443 c[1] = pixel[0][1] * (1.0f / 255.0f);
2444 c[2] = pixel[0][0] * (1.0f / 255.0f);
2445 c[3] = pixel[0][3] * (1.0f / 255.0f);
2446 out4f[x*4+0] = c[0];
2447 out4f[x*4+1] = c[1];
2448 out4f[x*4+2] = c[2];
2449 out4f[x*4+3] = c[3];
// nearest-texel sampling, coordinates wrapped with the size-1 mask
2454 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2456 tci[0] = subtc[0]>>16;
2457 tci[1] = subtc[1]>>16;
2458 tci[0] &= tciwrapmask[0];
2459 tci[1] &= tciwrapmask[1];
2460 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2461 c[0] = pixel[0][2] * (1.0f / 255.0f);
2462 c[1] = pixel[0][1] * (1.0f / 255.0f);
2463 c[2] = pixel[0][0] * (1.0f / 255.0f);
2464 c[3] = pixel[0][3] * (1.0f / 255.0f);
2465 out4f[x*4+0] = c[0];
2466 out4f[x*4+1] = c[1];
2467 out4f[x*4+2] = c[2];
2468 out4f[x*4+3] = c[3];
2474 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2478 int startx = span->startx;
2479 int endx = span->endx;
2481 __m128 data, slope, tcscale;
2482 __m128i tcsize, tcmask, tcoffset, tcmax;
2484 __m128i subtc, substep, endsubtc;
2487 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2488 const unsigned char * RESTRICT pixelbase;
2489 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2490 // if no texture is bound, just fill it with white
2493 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2496 mip = triangle->mip[texunitindex];
2497 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2498 // if this mipmap of the texture is 1 pixel, just fill it with that color
2499 if (texture->mipmap[mip][1] == 4)
2501 unsigned int k = *((const unsigned int *)pixelbase);
2502 for (x = startx;x < endx;x++)
2506 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2507 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2508 flags = texture->flags;
2509 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2510 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2511 tcscale = _mm_cvtepi32_ps(tcsize);
2512 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2513 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2514 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2515 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2516 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2517 tcmax = _mm_packs_epi32(tcmask, tcmask);
2518 for (x = startx;x < endx;)
2520 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2521 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2522 if (nextsub >= endx)
2524 nextsub = endsub = endx-1;
2525 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2529 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2530 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2531 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2532 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2533 substep = _mm_slli_epi32(substep, 1);
2536 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2537 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2539 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2540 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2542 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2543 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2544 tci = _mm_madd_epi16(tci, tcoffset);
2545 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2546 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2547 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2548 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2549 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2550 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2551 fracm = _mm_srli_epi16(subtc, 1);
2552 pix1 = _mm_add_epi16(pix1,
2553 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2554 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2555 pix3 = _mm_add_epi16(pix3,
2556 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2557 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2558 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2559 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2560 pix2 = _mm_add_epi16(pix2,
2561 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2562 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2563 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2567 const unsigned char * RESTRICT ptr1;
2568 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2569 tci = _mm_madd_epi16(tci, tcoffset);
2570 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2571 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2572 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2573 fracm = _mm_srli_epi16(subtc, 1);
2574 pix1 = _mm_add_epi16(pix1,
2575 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2576 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2577 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2578 pix1 = _mm_add_epi16(pix1,
2579 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2580 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2581 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2585 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2587 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2589 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2590 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2591 tci = _mm_madd_epi16(tci, tcoffset);
2592 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2593 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2594 _mm_setzero_si128());
2595 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2596 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2597 _mm_setzero_si128());
2598 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2599 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2600 tci = _mm_madd_epi16(tci, tcoffset);
2601 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2602 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2603 _mm_setzero_si128());
2604 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2605 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2606 _mm_setzero_si128());
2607 fracm = _mm_srli_epi16(subtc, 1);
2608 pix1 = _mm_add_epi16(pix1,
2609 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2610 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2611 pix3 = _mm_add_epi16(pix3,
2612 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2613 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2614 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2615 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2616 pix2 = _mm_add_epi16(pix2,
2617 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2618 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2619 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2623 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2624 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2625 tci = _mm_madd_epi16(tci, tcoffset);
2626 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2627 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2628 _mm_setzero_si128());
2629 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2630 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2631 _mm_setzero_si128());
2632 fracm = _mm_srli_epi16(subtc, 1);
2633 pix1 = _mm_add_epi16(pix1,
2634 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2635 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2636 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2637 pix1 = _mm_add_epi16(pix1,
2638 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2639 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2640 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2646 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2648 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2649 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2650 tci = _mm_madd_epi16(tci, tcoffset);
2651 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2652 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2653 _mm_setzero_si128());
2654 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2655 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2656 _mm_setzero_si128());
2657 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2658 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659 tci = _mm_madd_epi16(tci, tcoffset);
2660 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662 _mm_setzero_si128());
2663 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665 _mm_setzero_si128());
2666 fracm = _mm_srli_epi16(subtc, 1);
2667 pix1 = _mm_add_epi16(pix1,
2668 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2670 pix3 = _mm_add_epi16(pix3,
2671 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2672 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2673 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2674 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2675 pix2 = _mm_add_epi16(pix2,
2676 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2677 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2678 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2682 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2683 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2684 tci = _mm_madd_epi16(tci, tcoffset);
2685 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2686 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2687 _mm_setzero_si128());
2688 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2689 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2690 _mm_setzero_si128());
2691 fracm = _mm_srli_epi16(subtc, 1);
2692 pix1 = _mm_add_epi16(pix1,
2693 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2694 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2695 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2696 pix1 = _mm_add_epi16(pix1,
2697 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2698 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2699 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2706 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2708 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2710 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2711 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2712 tci = _mm_madd_epi16(tci, tcoffset);
2713 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2714 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2718 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2719 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2720 tci = _mm_madd_epi16(tci, tcoffset);
2721 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2727 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2729 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2730 tci = _mm_and_si128(tci, tcmax);
2731 tci = _mm_madd_epi16(tci, tcoffset);
2732 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2733 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2737 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2738 tci = _mm_and_si128(tci, tcmax);
2739 tci = _mm_madd_epi16(tci, tcoffset);
2740 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map span sampler placeholder: fills out4ub for x in
// [span->startx, span->endx) with 255 in every byte (opaque white).
// triangle, texunitindex, arrayindex and zf are not used by the visible code.
2749 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// the byte count is (endx - startx) * 4; the reversed subtraction produced a
// negative int that converts to a huge size_t and overruns the buffer
2752 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Shadowmap sampling hook: takes a pointer to a float vector and returns a float.
// NOTE(review): the body is elided from this excerpt -- confirm what it
// actually computes/returns before relying on it.
2755 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiply each of the four float channels of in4f by the interpolated
// vertex attribute `arrayindex` and store the product in out4f, for every x
// in [span->startx, span->endx).
//   out4f / in4f - 4 floats per pixel, indexed by span-relative x
//   zf           - per-pixel float array (see the note on z below)
2761 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2764 int startx = span->startx;
2765 int endx = span->endx;
// fills data[] / slope[] for the attribute; its value at column x is (data + slope*x) * z
2770 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2771 for (x = startx;x < endx;x++)
// NOTE(review): z is assigned on a line elided from this excerpt -- presumably zf[x]; confirm
2774 c[0] = (data[0] + slope[0]*x) * z;
2775 c[1] = (data[1] + slope[1]*x) * z;
2776 c[2] = (data[2] + slope[2]*x) * z;
2777 c[3] = (data[3] + slope[3]*x) * z;
2778 out4f[x*4+0] = in4f[x*4+0] * c[0];
2779 out4f[x*4+1] = in4f[x*4+1] * c[1];
2780 out4f[x*4+2] = in4f[x*4+2] * c[2];
2781 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Write the interpolated vertex attribute `arrayindex` (four floats) into
// out4f for every x in [span->startx, span->endx).
//   out4f - 4 floats per pixel, indexed by span-relative x
//   zf    - per-pixel float array (see the note on z below)
2785 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2788 int startx = span->startx;
2789 int endx = span->endx;
// fills data[] / slope[] for the attribute; its value at column x is (data + slope*x) * z
2794 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2795 for (x = startx;x < endx;x++)
// NOTE(review): z is assigned on a line elided from this excerpt -- presumably zf[x]; confirm
2798 c[0] = (data[0] + slope[0]*x) * z;
2799 c[1] = (data[1] + slope[1]*x) * z;
2800 c[2] = (data[2] + slope[2]*x) * z;
2801 c[3] = (data[3] + slope[3]*x) * z;
2802 out4f[x*4+0] = c[0];
2803 out4f[x*4+1] = c[1];
2804 out4f[x*4+2] = c[2];
2805 out4f[x*4+3] = c[3];
// Per channel, for every x in [span->startx, span->endx):
//   out4f = ina4f + max(inb4f - subcolor, 0)
//   out4f / ina4f / inb4f - 4 floats per pixel, indexed by span-relative x
//   subcolor              - 4 floats subtracted from inb4f before the add
// triangle is not used by the visible code.
2809 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2811 int x, startx = span->startx, endx = span->endx;
2812 float c[4], localcolor[4];
// copy subcolor into locals once, before the loop
2813 localcolor[0] = subcolor[0];
2814 localcolor[1] = subcolor[1];
2815 localcolor[2] = subcolor[2];
2816 localcolor[3] = subcolor[3];
2817 for (x = startx;x < endx;x++)
// subtract the threshold color and clamp negative results to zero
2819 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2820 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2821 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2822 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2823 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2824 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2825 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2826 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Per channel, for every x in [span->startx, span->endx):
//   out4f = ina4f * inb4f
// All three buffers hold 4 floats per pixel, indexed by span-relative x.
// triangle is not used by the visible code.
2830 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2832 int x, startx = span->startx, endx = span->endx;
2833 for (x = startx;x < endx;x++)
2835 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2836 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2837 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2838 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Per channel, for every x in [span->startx, span->endx):
//   out4f = ina4f + inb4f
// All three buffers hold 4 floats per pixel, indexed by span-relative x.
// triangle is not used by the visible code.
2842 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2844 int x, startx = span->startx, endx = span->endx;
2845 for (x = startx;x < endx;x++)
2847 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2848 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2849 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2850 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Per channel, for every x in [span->startx, span->endx):
//   out4f = ina4f * a + inb4f * b
// where a = 1 - (fourth channel of inb4f at that pixel).
// All three buffers hold 4 floats per pixel, indexed by span-relative x.
// triangle is not used by the visible code.
2854 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2856 int x, startx = span->startx, endx = span->endx;
2858 for (x = startx;x < endx;x++)
2860 a = 1.0f - inb4f[x*4+3];
// NOTE(review): b is assigned on a line elided from this excerpt --
// presumably inb4f[x*4+3], making this a standard alpha mix; confirm
2862 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2863 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2864 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2865 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2869 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2871 int x, startx = span->startx, endx = span->endx;
2872 float localcolor[4], ilerp, lerp;
2873 localcolor[0] = color[0];
2874 localcolor[1] = color[1];
2875 localcolor[2] = color[2];
2876 localcolor[3] = color[3];
2877 ilerp = 1.0f - localcolor[3];
2878 lerp = localcolor[3];
2879 for (x = startx;x < endx;x++)
2881 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2882 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2883 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2884 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies an 8-bit 4-channel span (in4ub) by an interpolated vertex attribute
// and writes the result to out4ub, using SSE2 integer math.
//   arrayindex - attribute array handed to DPSOFTRAST_CALCATTRIB
//   zf         - per-pixel float factor, indexed by x, multiplied into the attribute
// The attribute is evaluated at sub-span end points and stepped in fixed point
// (scale 256) in between.
// NOTE(review): several short lines of this function (declarations of x, data,
// slope, mod, endmod, and the statements between the listed lines) are not
// visible in this excerpt.
2890 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2894 int startx = span->startx;
2895 int endx = span->endx;
2898 __m128i submod, substep, endsubmod;
2899 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) exchanges elements 0 and 2 and keeps 1 and 3,
// matching the attribute's channel order to the BGRA byte order of the buffers
2900 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2901 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at startx, scaled by zf[startx], then converted to 8.8 fixed point
2902 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2903 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// x is advanced inside the loop body, one sub-span per iteration
2904 for (x = startx; x < endx;)
2906 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2907 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span: end at the final pixel and rescale the step
2908 if (nextsub >= endx)
2910 nextsub = endsub = endx-1;
2911 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the end of this sub-span and the fixed-point step towards it
2915 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2916 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2917 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = modulator for pixel x, high half = pixel x+1
2918 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2919 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by one substep per two pixels here - confirm
// whether the step was meant to be doubled
2920 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes are placed in the high byte of each 16-bit lane (value << 8), so the
// high 16 bits of the product equal (byte * submod) >> 8
2922 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2923 pix = _mm_mulhi_epu16(pix, submod);
// _mm_packus_epi16 saturates each lane to 0..255
2924 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not
// visible in this excerpt)
2928 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2929 pix = _mm_mulhi_epu16(pix, submod);
2930 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute directly to an 8-bit 4-channel span
// (out4ub), using SSE2 integer math.
//   arrayindex - attribute array handed to DPSOFTRAST_CALCATTRIB
//   zf         - per-pixel float factor, indexed by x, multiplied into the attribute
// The attribute is stepped in fixed point with scale 4095 and shifted right by 4
// before being packed to bytes.
// NOTE(review): several short lines of this function (declarations of x, data,
// slope, mod, endmod, and the statements between the listed lines) are not
// visible in this excerpt.
2937 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2941 int startx = span->startx;
2942 int endx = span->endx;
2945 __m128i submod, substep, endsubmod;
2946 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// exchange elements 0 and 2 so the attribute's channel order matches BGRA bytes
2947 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2948 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at startx, scaled by zf[startx], then converted to fixed point (x4095)
2949 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2950 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// x is advanced inside the loop body, one sub-span per iteration
2951 for (x = startx; x < endx;)
2953 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2954 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span: end at the final pixel and rescale the step
2955 if (nextsub >= endx)
2957 nextsub = endsub = endx-1;
2958 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the end of this sub-span and the fixed-point step towards it
2962 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2963 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2964 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = pixel x+1
2965 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2966 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by one substep per two pixels here - confirm
// whether the step was meant to be doubled
2967 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then saturate each lane to 0..255 when packing
2969 __m128i pix = _mm_srai_epi16(submod, 4);
2970 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not
// visible in this excerpt)
2974 __m128i pix = _mm_srai_epi16(submod, 4);
2975 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit SSE2 bloom add: out4ub = saturate(ina4ub + (inb4ub - subcolor*255)) per
// channel, for pixels starting at span->startx, two pixels per loop iteration.
//   subcolor - four floats; converted to integers on a 0..255 scale
// triangle is not referenced in the lines shown here.
2982 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2985 int x, startx = span->startx, endx = span->endx;
// scale to 0..255, exchange elements 0 and 2 to match the BGRA byte order
2986 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes and repeat the four channels for the second pixel
2987 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2988 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input from bytes to 16-bit lanes
2990 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2991 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// NOTE(review): the difference (pix2 - localcolor) is not clamped at zero before
// the add; only the final sum is saturated by the pack below. The float routine
// DPSOFTRAST_Draw_Span_AddBloom clamps the difference first - confirm intent.
2992 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
// _mm_packus_epi16 saturates each lane to 0..255
2993 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not
// visible in this excerpt)
2997 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2998 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2999 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3000 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 multiply of two spans: out4ub = (ina4ub * inb4ub) >> 8 per channel,
// for pixels starting at span->startx, two pixels per loop iteration.
// triangle is not referenced in the lines shown here.
3005 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3008 int x, startx = span->startx, endx = span->endx;
3009 for (x = startx;x+2 <= endx;x+=2)
// pix1: bytes zero-extended to 16 bits; pix2: bytes in the high byte (value << 8)
3011 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3012 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of a * (b << 8) equal (a * b) >> 8
3013 pix1 = _mm_mulhi_epu16(pix1, pix2);
3014 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not
// visible in this excerpt)
3018 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3019 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3020 pix1 = _mm_mulhi_epu16(pix1, pix2);
3021 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 add of two spans: out4ub = saturate(ina4ub + inb4ub) per channel,
// for pixels starting at span->startx, two pixels per loop iteration.
// triangle is not referenced in the lines shown here.
3026 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3029 int x, startx = span->startx, endx = span->endx;
3030 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input from bytes to 16-bit lanes
3032 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3033 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3034 pix1 = _mm_add_epi16(pix1, pix2);
// _mm_packus_epi16 saturates each sum to 0..255
3035 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not
// visible in this excerpt)
3039 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3040 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3041 pix1 = _mm_add_epi16(pix1, pix2);
3042 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 tinted add: out4ub = saturate(ina4ub + ((tint * inb4ub) >> 8)) per
// channel, where tint = inbtintbgra * 256, for pixels starting at span->startx,
// two pixels per loop iteration.
//   inbtintbgra - four floats, used in the order given (no channel exchange is applied)
// triangle is not referenced in the lines shown here.
3047 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3050 int x, startx = span->startx, endx = span->endx;
// tint as integers on a 0..256 scale
3051 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// narrow to 16-bit lanes and repeat the four channels for the second pixel
3052 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3053 for (x = startx;x+2 <= endx;x+=2)
// pix1: bytes zero-extended to 16 bits; pix2: bytes in the high byte (value << 8)
3055 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3056 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of tint * (b << 8) equal (tint * b) >> 8
3057 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3058 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not
// visible in this excerpt)
3062 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3063 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3064 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3065 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 blend of two spans, weighted by the fourth channel of inb4ub:
//   out4ub = ina4ub + (((inb4ub - ina4ub) * alpha) >> 8)
// for pixels starting at span->startx, two pixels per loop iteration.
// triangle is not referenced in the lines shown here.
3070 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3073 int x, startx = span->startx, endx = span->endx;
3074 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input from bytes to 16-bit lanes
3076 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3077 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 (the fourth channel) of each pixel across that pixel's four lanes
3078 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4, so the signed high-half product equals
// (difference * blend) >> 8
3079 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3080 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not
// visible in this excerpt); only the low four lanes carry a pixel here
3084 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3085 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3086 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3087 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3088 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 blend of a span toward one constant color, weighted by that
// color's fourth component:
//   out4ub = in4ub + (((color255 - in4ub) * alpha255) >> 8)
// where color255 = color * 255, for pixels starting at span->startx, two pixels
// per loop iteration.
// triangle is not referenced in the lines shown here.
3093 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3096 int x, startx = span->startx, endx = span->endx;
// scale to 0..255 and exchange elements 0 and 2 to match the BGRA byte order
3097 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes and repeat the four channels for the second pixel
3098 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// broadcast the fourth channel to every lane and pre-shift it by 4 for the multiply below
3099 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3100 for (x = startx;x+2 <= endx;x+=2)
3102 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// difference shifted by 4 times blend (already shifted by 4): the signed
// high-half product equals (difference * alpha255) >> 8
3103 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3104 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not
// visible in this excerpt)
3108 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3109 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3110 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the "generic" shader: runs DPSOFTRAST_Array_TransformProject
// on the position array with the ModelViewProjectionMatrixM1 uniform, then loads
// the color and texcoord0 arrays; texcoord1 is loaded only when the SPECULAR
// permutation bit is set.
3117 void DPSOFTRAST_VertexShader_Generic(void)
3119 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3120 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3121 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// the second texture coordinate set is only needed by the SPECULAR permutation
3122 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3123 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the "generic" shader for one span. Works in 8-bit BGRA scratch
// buffers sized for DPSOFTRAST_DRAW_MAXSPANLENGTH pixels and hands the result to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// With the DIFFUSE permutation the first texture is sampled and multiplied by
// attribute array 1; with SPECULAR a second texture is sampled and combined by
// multiply, add or alpha mix depending on further permutation bits. The other
// visible path writes attribute array 1 directly.
// NOTE(review): the brace/else lines are not visible in this excerpt, so the
// exact nesting of the branches below must be confirmed against the full file.
3126 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3128 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3129 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3130 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3132 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3133 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// sample GL20TU_FIRST using attribute array 2, then modulate by attribute array 1
3135 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3136 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3137 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3139 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
// the three combine calls below work in place on buffer_FragColorbgra8
3140 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3143 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this tests the same COLORMAPPING bit as the condition above;
// if both belong to one if/else chain this branch can never run - confirm
3145 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3148 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3150 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3153 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured path: write attribute array 1 straight into the output buffer
3158 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3159 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: runs DPSOFTRAST_Array_TransformProject
// on the position array with the ModelViewProjectionMatrixM1 uniform and loads
// the texcoord0 and texcoord1 arrays.
3164 void DPSOFTRAST_VertexShader_PostProcess(void)
3166 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3167 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3168 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the post-process shader for one span: samples GL20TU_FIRST into
// the output buffer, optionally adds the thresholded GL20TU_SECOND texture when
// the BLOOM permutation is set, blends toward the ViewTintColor uniform, and
// hands the result to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The SATURATION and GAMMARAMPS permutations are recognised but not implemented.
3171 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3173 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3174 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3175 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3176 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3177 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture, sampled with attribute array 2
3178 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3179 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture, sampled with attribute array 3, added in place after
// subtracting the BloomColorSubtract uniform
3181 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3182 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// in-place blend toward the view tint color
3184 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3185 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3187 // TODO: implement saturation
3189 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3191 // TODO: implement gammaramps
3193 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is processed,
// via DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform.
3198 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3200 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: fills the span's portion of the output
// buffer with zero bytes and hands it to DPSOFTRAST_Draw_Span_FinishBGRA8.
3203 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3205 // this is never called (because colormask is off when this shader is used)
3206 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3207 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3208 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// four bytes per pixel, covering pixels startx .. endx-1 only
3209 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3210 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: runs DPSOFTRAST_Array_TransformProject
// on the position array with the ModelViewProjectionMatrixM1 uniform and
// DPSOFTRAST_Array_Transform on texcoord0 with the TexMatrixM1 uniform.
3215 void DPSOFTRAST_VertexShader_FlatColor(void)
3217 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3218 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3221 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3223 int x, startx = span->startx, endx = span->endx;
3224 int Color_Ambienti[4];
3225 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3226 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3227 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3229 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3230 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3231 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3232 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3233 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3234 for (x = startx;x < endx;x++)
3236 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3237 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3238 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3239 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3241 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: runs DPSOFTRAST_Array_TransformProject
// on the position array with the ModelViewProjectionMatrixM1 uniform, loads the
// color array, and runs DPSOFTRAST_Array_Transform on texcoord0 with the
// TexMatrixM1 uniform.
3246 void DPSOFTRAST_VertexShader_VertexColor(void)
3248 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3249 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3250 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span (SSE2):
//   pixel = texture * (Color_Diffuse * interpolated vertex color + Color_Ambient)
// with the fourth channel of the ambient term taken from the Alpha uniform.
// Output goes straight to the framebuffer row unless alpha test or a non-opaque
// blend mode is active, in which case it goes to a scratch buffer that is then
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several short lines of this function (declarations of data,
// slope, mod2, pix2 and some statements between the listed lines) are not
// visible in this excerpt.
3253 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3256 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: color buffer 0 at the span's (x, y), four bytes per pixel
3257 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3258 int x, startx = span->startx, endx = span->endx;
3259 __m128i Color_Ambientm, Color_Diffusem;
3261 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3262 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3263 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3264 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3265 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3266 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render to the scratch buffer instead
3267 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3268 pixel = buffer_FragColorbgra8;
// ambient color x256, elements 0 and 2 exchanged for BGRA order; lane 3 is
// cleared and replaced by the Alpha uniform x255, then narrowed to 16-bit lanes
3269 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3270 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3271 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3272 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color x4096, same channel exchange, lane 3 cleared, narrowed to 16-bit lanes
3273 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3274 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3275 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// interpolants of the vertex color attribute, reordered to BGRA, advanced to
// startx and scaled by 4096
3276 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3277 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3278 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3279 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3280 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3281 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by one slope per pixel in the loop increment
3282 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3284 __m128i color, mod, pix;
// fast path: four pixels at once when all four pixelmask bytes equal 0x01
3285 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3288 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3289 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// per-pixel modulators: data times the matching buffer_z entry, with data
// stepped by slope between pixels (three steps here, the fourth in the loop increment)
3290 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3291 data = _mm_add_ps(data, slope);
3292 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3293 data = _mm_add_ps(data, slope);
3294 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3295 data = _mm_add_ps(data, slope);
3296 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * mod + ambient) * texel, with the texel placed in the high byte of each lane
3297 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3298 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3299 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3300 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
// saturate to bytes and store all four pixels
3301 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path (the statements that skip ahead after the fast path and
// that guard this path are not visible in this excerpt)
3307 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3308 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3309 mod = _mm_packs_epi32(mod, mod);
3310 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3311 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the scratch-buffer case needs the finish pass
3313 if (pixel == buffer_FragColorbgra8)
3314 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: runs DPSOFTRAST_Array_TransformProject on
// the position array with the ModelViewProjectionMatrixM1 uniform, runs
// DPSOFTRAST_Array_Transform on texcoord0 with the TexMatrixM1 uniform, and
// loads the texcoord4 array.
3320 void DPSOFTRAST_VertexShader_Lightmap(void)
3322 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3323 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3324 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span (SSE2):
//   pixel = color texel * (Color_Diffuse * lightmap texel + Color_Ambient)
// plus, with the GLOW permutation, Color_Glow * glow texel.
// The fourth channel of the ambient term comes from the Alpha uniform.
// Output goes straight to the framebuffer row unless alpha test or a non-opaque
// blend mode is active, in which case it goes to a scratch buffer that is then
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): brace/else lines, the pix2 declaration and some statements
// between the listed lines are not visible in this excerpt; the second pixel
// loop below is presumably the non-GLOW branch - confirm against the full file.
3327 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3330 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: color buffer 0 at the span's (x, y), four bytes per pixel
3331 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3332 int x, startx = span->startx, endx = span->endx;
3333 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3334 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3335 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3336 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3337 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3338 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3339 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses texcoord0, lightmap texture uses texcoord4
3340 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3341 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the finish pass, so render to the scratch buffer instead
3342 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3343 pixel = buffer_FragColorbgra8;
// ambient color x256, elements 0 and 2 exchanged for BGRA order; lane 3 is
// cleared and replaced by the Alpha uniform x255, then narrowed to 16-bit lanes
3344 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3345 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3346 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3347 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color x256, same channel exchange, lane 3 cleared, narrowed to 16-bit lanes
3348 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3349 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3350 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3351 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture shares texcoord0 with the color texture
3353 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow color x256, same channel exchange, lane 3 cleared, narrowed to 16-bit lanes
3354 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3355 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3356 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow; used by the single-pixel path below
3357 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3358 for (x = startx;x < endx;x++)
3360 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when all four pixelmask bytes equal 0x01
3361 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3364 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3365 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3366 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * color + glowcolor * glow, pixels 0-1 then 2-3
3367 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3368 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3369 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3370 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3371 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3372 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3373 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the low half computes the lit color term and the high half
// the glow term (the lightmap's high half is zero there), then the halves are summed
3379 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3380 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3381 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3382 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3383 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3384 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same loop without the glow term
3389 for (x = startx;x < endx;x++)
3391 __m128i color, lightmap, pix;
// fast path: four pixels at once when all four pixelmask bytes equal 0x01
3392 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3395 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3396 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3397 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3398 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3399 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3400 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3401 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3407 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3408 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3409 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3410 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the scratch-buffer case needs the finish pass
3413 if (pixel == buffer_FragColorbgra8)
3414 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the fake-light shader: only the position array is processed,
// via DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform.
3420 void DPSOFTRAST_VertexShader_FakeLight(void)
3422 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake-light shader: in the lines shown here it fills the
// span's portion of the output buffer with zero bytes and hands it to
// DPSOFTRAST_Draw_Span_FinishBGRA8; no lighting is computed.
3425 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3428 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3429 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3430 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// four bytes per pixel, covering pixels startx .. endx-1 only
3431 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3432 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for model-space light-direction maps: delegates unchanged to
// DPSOFTRAST_VertexShader_Lightmap.
3437 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3439 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for model-space light-direction maps: delegates unchanged to
// DPSOFTRAST_PixelShader_Lightmap with the same thread, triangle and span.
3442 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3444 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for tangent-space light-direction maps: delegates unchanged to
// DPSOFTRAST_VertexShader_Lightmap.
3450 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3452 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for tangent-space light-direction maps: delegates unchanged to
// DPSOFTRAST_PixelShader_Lightmap with the same thread, triangle and span.
3455 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3457 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex shader for the LightDirection mode.
// For every vertex it re-expresses two vectors in the per-vertex basis
// (svector, tvector, normal) by taking three dot products each:
//   - the LightDir uniform                    -> written to post TEXCOORD1
//   - the vector from the vertex to the eye   -> written to post TEXCOORD2
// TEXCOORD0 is run through the TexMatrix uniform, and finally the positions
// are transformed and projected with the ModelViewProjection matrix.
3463 void DPSOFTRAST_VertexShader_LightDirection(void)
3466 int numvertices = dpsoftrast.numvertices;
3468 float LightVector[4];
3469 float EyePosition[4];
3470 float EyeVectorModelSpace[4];
// snapshot the uniforms used by the per-vertex loop
3476 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3477 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3478 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3479 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3480 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3481 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3482 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3483 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// prepare the arrays the loop reads from dpsoftrast.post_array4f
// (presumably Array_Load copies the input array into the post array - confirm)
3484 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3485 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3486 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3487 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3488 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3489 for (i = 0;i < numvertices;i++)
// read position and the basis vectors: TEXCOORD1 = svector, TEXCOORD2 = tvector, TEXCOORD3 = normal
// (all are copied to locals first because TEXCOORD1 and TEXCOORD2 are overwritten below)
3491 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3492 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3493 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3494 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3495 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3496 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3497 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3498 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3499 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3500 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3501 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3502 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light direction expressed in the (svector, tvector, normal) basis
3503 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3504 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3505 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
// TEXCOORD1 now carries the light vector (w forced to 0)
3506 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3507 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3508 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3509 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from this vertex to the eye, then expressed in the same basis
3510 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3511 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3512 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3513 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3514 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3515 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector (w forced to 0)
3516 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3517 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3518 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3519 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// NOTE(review): -1 as the input array presumably selects the already loaded post POSITION data - confirm against DPSOFTRAST_Array_TransformProject
3521 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar and 3-component vector helper macros used by the pixel shaders.
// Arguments are expanded more than once, so do not pass expressions with side effects.
3524 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3525 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product over elements [0..2] of two indexable operands
3526 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3527 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3528 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// multi-line macro; it begins by taking the vector's length as sqrt(v . v)
3529 #define DPSOFTRAST_Vector3Normalize(v)\
3532 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for the LightDirection mode.
// Samples the color texture (plus pants/shirt, glow, normal and gloss textures
// depending on thread->shader_permutation) for the span, then computes each
// pixel's color in one of three paths:
//   - SHADERPERMUTATION_SPECULAR: ambient + diffuse + specular
//   - SHADERPERMUTATION_DIFFUSE:  ambient + diffuse
//   - otherwise:                  ambient only
// Each path optionally adds the glow texture, clamps the result to at most 255
// and stores it in buffer_FragColorbgra8, which is passed to the BGRA8 finisher.
//   thread   - per-thread state; supplies uniform4f and shader_permutation
//   triangle - triangle being drawn, forwarded to the span/attribute helpers
//   span     - span being shaded; pixels startx .. endx-1 are processed
3543 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers; texture and output buffers hold 4 bytes per pixel and are indexed by absolute x
3545 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3546 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3547 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3548 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3549 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3550 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3551 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3552 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3553 int x, startx = span->startx, endx = span->endx;
3554 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3555 float LightVectordata[4];
3556 float LightVectorslope[4];
3557 float EyeVectordata[4];
3558 float EyeVectorslope[4];
3560 float diffusetex[4];
3562 float surfacenormal[4];
3563 float lightnormal[4];
3565 float specularnormal[4];
3568 float SpecularPower;
// Color uniforms are stored with components 0 and 2 swapped so that local index k
// lines up with byte k of the *bgra8 buffers (uniforms are presumably RGB - confirm).
3570 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3571 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3572 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3573 Color_Glow[3] = 0.0f;
3574 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3575 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3576 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the ambient alpha slot carries the Alpha uniform; it scales the output alpha in every path
3577 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3578 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3579 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3580 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3581 Color_Pants[3] = 0.0f;
3582 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3583 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3584 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3585 Color_Shirt[3] = 0.0f;
// begin the span and sample the textures needed regardless of lighting path
3586 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3587 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3588 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3590 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3591 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3593 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3595 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: ambient + diffuse + specular ----
3597 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3599 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3600 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3601 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3602 Color_Diffuse[3] = 0.0f;
3603 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3604 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3605 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3606 LightColor[3] = 0.0f;
// data/slope pair for the light vector attribute (TEXCOORD1); evaluated below as data + slope*x
3607 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3608 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3609 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3610 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3611 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3612 Color_Specular[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the gloss alpha byte (0..255)
3613 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// data/slope pair for the eye vector attribute (TEXCOORD2)
3614 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3615 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3616 for (x = startx;x < endx;x++)
// diffuse texel as floats in byte scale (0..255)
3619 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3620 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3621 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3622 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3623 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add tinted pants/shirt layers; the alpha line adds nothing because Color_Pants[3] and Color_Shirt[3] are 0
3625 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3626 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3627 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3628 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3630 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3631 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3632 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3633 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1 maps 0..255 to -1..~0.992; bytes 2,1,0 feed components 0,1,2
3634 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3635 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3636 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3637 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated light vector at this pixel (z is presumably taken from buffer_z[x] - confirm)
3639 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3640 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3641 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3642 DPSOFTRAST_Vector3Normalize(lightnormal);
// interpolated eye vector at this pixel
3644 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3645 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3646 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3647 DPSOFTRAST_Vector3Normalize(eyenormal);
// direction used for the specular term: normalized sum of the light and eye directions
3649 specularnormal[0] = lightnormal[0] + eyenormal[0];
3650 specularnormal[1] = lightnormal[1] + eyenormal[1];
3651 specularnormal[2] = lightnormal[2] + eyenormal[2];
3652 DPSOFTRAST_Vector3Normalize(specularnormal);
// both lighting terms are clamped to be non-negative before use
3654 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3655 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform scaled by the gloss texel's alpha
3656 specular = pow(specular, SpecularPower * glosstex[3]);
3657 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow + ambient + (diffuse + specular) * light color; only the upper bound (255) is clamped
3659 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3660 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3661 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3662 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3666 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3667 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3668 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3669 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3671 buffer_FragColorbgra8[x*4+0] = d[0];
3672 buffer_FragColorbgra8[x*4+1] = d[1];
3673 buffer_FragColorbgra8[x*4+2] = d[2];
3674 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: ambient + diffuse ----
3677 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3679 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3680 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3681 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3682 Color_Diffuse[3] = 0.0f;
3683 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3684 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3685 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3686 LightColor[3] = 0.0f;
3687 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3688 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3689 for (x = startx;x < endx;x++)
// NOTE(review): unlike the specular path, the pants/shirt layers are not added here even when
// SHADERPERMUTATION_COLORMAPPING is set - confirm whether that is intended
3692 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3693 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3694 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3695 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// decode the normal map texel (byte/128 - 1, bytes 2,1,0 -> components 0,1,2)
3696 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3697 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3698 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3699 DPSOFTRAST_Vector3Normalize(surfacenormal);
3701 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3702 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3703 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3704 DPSOFTRAST_Vector3Normalize(lightnormal);
3706 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3707 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow + texel * (ambient + diffuse * light color); only the upper bound (255) is clamped
3709 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3710 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3711 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3712 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3716 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3717 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3718 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3719 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3721 buffer_FragColorbgra8[x*4+0] = d[0];
3722 buffer_FragColorbgra8[x*4+1] = d[1];
3723 buffer_FragColorbgra8[x*4+2] = d[2];
3724 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (no normal map or light vector needed) ----
3729 for (x = startx;x < endx;x++)
// NOTE(review): pants/shirt layers are not added in this path either - confirm intended
3732 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3733 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3734 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3735 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3737 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3739 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3740 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3741 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3742 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3746 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3747 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3748 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3749 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3751 buffer_FragColorbgra8[x*4+0] = d[0];
3752 buffer_FragColorbgra8[x*4+1] = d[1];
3753 buffer_FragColorbgra8[x*4+2] = d[2];
3754 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the BGRA8 finisher
3757 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode.
// For every vertex it builds the vertex-to-light and vertex-to-eye vectors and
// re-expresses both in the per-vertex basis (svector, tvector, normal) with
// three dot products each:
//   - vertex-to-light vector -> written to post TEXCOORD1
//   - vertex-to-eye vector   -> written to post TEXCOORD2
// TEXCOORD0 is run through the TexMatrix uniform, the positions are transformed
// and projected with the ModelViewProjection matrix, and TEXCOORD3 is produced
// by transforming POSITION with the ModelToLight matrix.
3762 void DPSOFTRAST_VertexShader_LightSource(void)
3765 int numvertices = dpsoftrast.numvertices;
3766 float LightPosition[4];
3767 float LightVector[4];
3768 float LightVectorModelSpace[4];
3769 float EyePosition[4];
3770 float EyeVectorModelSpace[4];
// snapshot the uniforms used by the per-vertex loop
3776 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3777 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3778 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3779 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3780 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3781 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3782 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3783 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// prepare the arrays the loop reads from dpsoftrast.post_array4f
// (presumably Array_Load copies the input array into the post array - confirm)
3784 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3785 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3786 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3787 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3788 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
// NOTE(review): TEXCOORD4 is loaded but not referenced by the remaining lines shown for this function
3789 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3790 for (i = 0;i < numvertices;i++)
// read position and the basis vectors: TEXCOORD1 = svector, TEXCOORD2 = tvector, TEXCOORD3 = normal
// (all are copied to locals first because TEXCOORD1 and TEXCOORD2 are overwritten below)
3792 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3793 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3794 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3795 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3796 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3797 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3798 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3799 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3800 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3801 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3802 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3803 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from this vertex to the light, then expressed in the (svector, tvector, normal) basis
3804 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3805 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3806 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3807 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3808 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3809 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// TEXCOORD1 now carries the light vector (w forced to 0)
3810 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3811 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3812 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3813 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from this vertex to the eye, then expressed in the same basis
3814 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3815 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3816 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3817 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3818 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3819 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector (w forced to 0)
3820 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3821 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3822 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3823 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// NOTE(review): -1 as the input array presumably selects the already loaded post POSITION data - confirm against DPSOFTRAST_Array_TransformProject
3825 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 (the normal, already consumed by the loop above) is replaced by POSITION transformed with ModelToLight
// NOTE(review): presumably this reads the untransformed input positions rather than the projected ones - confirm against DPSOFTRAST_Array_Transform
3826 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3829 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3832 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3833 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3834 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3835 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3836 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3837 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3838 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3839 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3840 int x, startx = span->startx, endx = span->endx;
3841 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3842 float CubeVectordata[4];
3843 float CubeVectorslope[4];
3844 float LightVectordata[4];
3845 float LightVectorslope[4];
3846 float EyeVectordata[4];
3847 float EyeVectorslope[4];
3849 float diffusetex[4];
3851 float surfacenormal[4];
3852 float lightnormal[4];
3854 float specularnormal[4];
3857 float SpecularPower;
3858 float CubeVector[4];
3861 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3862 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3863 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3864 Color_Glow[3] = 0.0f;
3865 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3866 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3867 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3868 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3869 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3870 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3871 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3872 Color_Diffuse[3] = 0.0f;
3873 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3874 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3875 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3876 Color_Specular[3] = 0.0f;
3877 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3878 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3879 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3880 Color_Pants[3] = 0.0f;
3881 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3882 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3883 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3884 Color_Shirt[3] = 0.0f;
3885 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3886 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3887 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3888 LightColor[3] = 0.0f;
3889 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3890 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3891 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3892 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3893 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3894 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3895 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3896 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3898 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3899 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3901 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3902 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3903 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3905 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3906 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3907 for (x = startx;x < endx;x++)
3910 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3911 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3912 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3913 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3914 if (attenuation < 0.01f)
3916 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3918 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3919 if (attenuation < 0.01f)
3923 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3924 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3925 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3926 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3927 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3929 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3930 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3931 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3932 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3934 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3935 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3936 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3937 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3938 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3939 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3940 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3941 DPSOFTRAST_Vector3Normalize(surfacenormal);
3943 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3944 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3945 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3946 DPSOFTRAST_Vector3Normalize(lightnormal);
3948 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3949 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3950 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3951 DPSOFTRAST_Vector3Normalize(eyenormal);
3953 specularnormal[0] = lightnormal[0] + eyenormal[0];
3954 specularnormal[1] = lightnormal[1] + eyenormal[1];
3955 specularnormal[2] = lightnormal[2] + eyenormal[2];
3956 DPSOFTRAST_Vector3Normalize(specularnormal);
3958 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3959 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3960 specular = pow(specular, SpecularPower * glosstex[3]);
3961 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3963 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3964 attenuation *= (1.0f / 255.0f);
3965 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3966 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3967 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3968 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3972 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3973 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3974 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3975 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3977 buffer_FragColorbgra8[x*4+0] = d[0];
3978 buffer_FragColorbgra8[x*4+1] = d[1];
3979 buffer_FragColorbgra8[x*4+2] = d[2];
3980 buffer_FragColorbgra8[x*4+3] = d[3];
3983 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3985 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3986 for (x = startx;x < endx;x++)
3989 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3990 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3991 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3992 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3993 if (attenuation < 0.01f)
3995 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3997 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3998 if (attenuation < 0.01f)
4002 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4003 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4004 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4005 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4006 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4008 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4009 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4010 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4011 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4013 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4014 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4015 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4016 DPSOFTRAST_Vector3Normalize(surfacenormal);
4018 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4019 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4020 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4021 DPSOFTRAST_Vector3Normalize(lightnormal);
4023 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4024 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4026 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4027 attenuation *= (1.0f / 255.0f);
4028 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4029 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4030 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4031 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4035 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4036 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4037 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4038 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4040 buffer_FragColorbgra8[x*4+0] = d[0];
4041 buffer_FragColorbgra8[x*4+1] = d[1];
4042 buffer_FragColorbgra8[x*4+2] = d[2];
4043 buffer_FragColorbgra8[x*4+3] = d[3];
4048 for (x = startx;x < endx;x++)
4051 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4052 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4053 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4054 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4055 if (attenuation < 0.01f)
4057 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4059 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4060 if (attenuation < 0.01f)
4064 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4065 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4066 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4067 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4068 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4070 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4071 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4072 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4073 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4075 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4077 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4078 attenuation *= (1.0f / 255.0f);
4079 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4080 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4081 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4082 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4086 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4087 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4088 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4089 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4091 buffer_FragColorbgra8[x*4+0] = d[0];
4092 buffer_FragColorbgra8[x*4+1] = d[1];
4093 buffer_FragColorbgra8[x*4+2] = d[2];
4094 buffer_FragColorbgra8[x*4+3] = d[3];
4097 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the refraction shader mode.
// Its only visible work is one DPSOFTRAST_Array_TransformProject call that uses
// DPSOFTRAST_ARRAY_POSITION for both array arguments and passes the address
// dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4103 void DPSOFTRAST_VertexShader_Refraction(void)
4105 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the refraction shader mode.
// No refraction is computed in the visible code: the span is begun, the active
// pixels of a local BGRA8 buffer are zeroed, and the buffer is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): looks like a placeholder implementation - confirm.
4108 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4111 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4112 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4113 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel; only pixels [span->startx, span->endx) are cleared
4114 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4115 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the water shader mode.
// Its only visible work is one DPSOFTRAST_Array_TransformProject call that uses
// DPSOFTRAST_ARRAY_POSITION for both array arguments and passes the address
// dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4120 void DPSOFTRAST_VertexShader_Water(void)
4122 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the water shader mode.
// No water effect is computed in the visible code: the span is begun, the
// active pixels of a local BGRA8 buffer are zeroed, and the buffer is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): looks like a placeholder implementation - confirm.
4126 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4129 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4130 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4131 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel; only pixels [span->startx, span->endx) are cleared
4132 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4133 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the show-depth shader mode.
// Its only visible work is one DPSOFTRAST_Array_TransformProject call that uses
// DPSOFTRAST_ARRAY_POSITION for both array arguments and passes the address
// dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4138 void DPSOFTRAST_VertexShader_ShowDepth(void)
4140 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the show-depth shader mode.
// Despite the name, buffer_z is only passed to DPSOFTRAST_Draw_Span_Begin; the
// color bytes handed to DPSOFTRAST_Draw_Span_FinishBGRA8 are all zero.
// NOTE(review): looks like a placeholder implementation - confirm.
4143 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4146 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4147 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel; only pixels [span->startx, span->endx) are cleared
4149 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4150 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the deferred-geometry shader mode.
// Its only visible work is one DPSOFTRAST_Array_TransformProject call that uses
// DPSOFTRAST_ARRAY_POSITION for both array arguments and passes the address
// dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4155 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4157 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the deferred-geometry shader mode.
// No geometry data is written in the visible code: the span is begun, the
// active pixels of a local BGRA8 buffer are zeroed, and the buffer is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): looks like a placeholder implementation - confirm.
4160 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4163 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4164 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4165 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel; only pixels [span->startx, span->endx) are cleared
4166 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4167 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the deferred-light-source shader mode.
// Its only visible work is one DPSOFTRAST_Array_TransformProject call that uses
// DPSOFTRAST_ARRAY_POSITION for both array arguments and passes the address
// dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4172 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4174 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the deferred-light-source shader mode.
// No lighting is computed in the visible code: the span is begun, the active
// pixels of a local BGRA8 buffer are zeroed, and the buffer is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): looks like a placeholder implementation - confirm.
4177 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4180 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4181 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4182 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel; only pixels [span->startx, span->endx) are cleared
4183 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4184 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex function, its span (pixel) function,
// and the lists of vertex arrays and texture units it uses.
// NOTE(review): code in this file also reads a .lodarrayindex member and the
// table rows start with an integer; that member's declaration is not visible
// among the lines shown here - confirm it precedes Vertex.
4189 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; takes no arguments and is invoked through the mode table
4192 void (*Vertex)(void);
// span stage; invoked once per span with the owning thread and triangle
4193 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// array indices used by the mode; readers test each entry against DPSOFTRAST_ARRAY_TOTAL
4194 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by the mode; readers test each entry against DPSOFTRAST_MAXTEXTUREUNITS
4195 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4197 DPSOFTRAST_ShaderModeInfo;
// Per-shader-mode dispatch table, indexed by shader mode
// (DPSOFTRAST_ShaderModeTable[...shader_mode] at the use sites).
// Each row holds, in order: a leading integer (presumably the lodarrayindex
// member compared against array indices when mip levels are computed - confirm),
// the vertex function, the span function, the list of arrays and the list of
// texture units.
// Both lists are terminated with ~0, which becomes an out-of-range value once
// stored in an unsigned char; readers compare entries against
// DPSOFTRAST_ARRAY_TOTAL / DPSOFTRAST_MAXTEXTUREUNITS to detect the end.
// Every row spells out both lists. An omitted texunits initializer would be
// zero-filled, and zero is a valid texture unit index rather than a terminator.
4199 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4201 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4202 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4203 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4204 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4205 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4206 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4207 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4208 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4209 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4210 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4211 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
// the remaining modes use no extra arrays and no texture units
4212 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}, {~0}},
4213 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}, {~0}},
4214 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}, {~0}},
4215 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}, {~0}},
4216 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}, {~0}}
// Runs every queued span of one thread through the depth test and the span
// function of the thread's current shader mode, then empties the span queue
// (thread->numspans = 0).
// For each span:
//  - when thread->depthtest is set and a depth buffer exists, a per-pixel mask
//    is built by comparing the stored depth with the span's interpolated depth
//    according to thread->fb_depthfunc; the span function then runs if a color
//    buffer exists and thread->fb_colormask is set, and a further per-pixel
//    pass runs when thread->depthmask is set;
//  - otherwise every pixel of the span is marked as passing and only the span
//    function runs (again only with a color buffer and fb_colormask set).
// pixelmask is a local array that span->pixelmask points into, so the pointer
// is only meaningful while this function is executing.
4219 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4226 // unsigned int *colorpixel;
4227 unsigned int *depthpixel;
4233 DPSOFTRAST_State_Triangle *triangle;
4234 DPSOFTRAST_State_Span *span;
4235 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4236 for (i = 0; i < thread->numspans; i++)
4238 span = &thread->spans[i];
4239 triangle = &thread->triangles[span->triangle];
4240 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w holds a linear function of screen position: w[0] is the x slope,
// w[1] the y slope and w[2] the value at the origin
4242 wslope = triangle->w[0];
4243 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// integer depth step per pixel and integer depth at the span origin; the
// origin value is biased by the two polygon offset terms
4244 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4245 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel[x] addresses the depth buffer relative to the span origin (span->x, span->y)
4246 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4247 startx = span->startx;
// each case walks pixels startx..endx-1; d is the span's depth at pixel x and
// pixelmask[x] records whether that pixel passes the comparison
4249 switch(thread->fb_depthfunc)
4252 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4253 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4254 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4255 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4256 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4257 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4258 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4260 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4261 //for (x = startx;x < endx;x++)
4262 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4263 // if there is no color buffer, skip pixel shader
// the two loops below scan for masked-out pixels at the start and at the end
// of the span
// NOTE(review): presumably they advance startx and retreat endx - confirm
4264 while (startx < endx && !pixelmask[startx])
4266 while (endx > startx && !pixelmask[endx-1])
4269 continue; // no pixels to fill
4270 span->pixelmask = pixelmask;
4271 span->startx = startx;
4273 // run pixel shader if appropriate
4274 // do this before running depthmask code, to allow the pixelshader
4275 // to clear pixelmask values for alpha testing
4276 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4277 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// NOTE(review): the body of this loop is expected to store d into depthpixel[x]
// for pixels whose mask is still set - confirm
4278 if (thread->depthmask)
4279 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4285 // no depth testing means we're just dealing with color...
4286 // if there is no color buffer, skip pixel shader
4287 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// without a depth test every pixel of the span passes
4289 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4290 span->pixelmask = pixelmask;
4291 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all spans consumed; the queue can be refilled
4295 thread->numspans = 0;
// Declares the Draw command with the fields listed in the third argument.
// NOTE(review): presumably DEFCOMMAND expands to the DPSOFTRAST_Command_Draw
// type and the DPSOFTRAST_OPCODE_Draw opcode (value 22) used below, and adds
// the commandsize member that the draw code reads - confirm against the macro.
// refcount is an ATOMIC_COUNTER because it is decremented with ATOMIC_DECREMENT
// by the draw interpreter.
4298 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of one rasterizer thread.
//
// thread:  per-thread state (scissor, viewport, the two y bands miny1..maxy1
//          and miny2..maxy2, bound textures, triangle and span queues).
// command: the draw command; command->arrays holds the projected screen
//          coordinates followed by the post-transform vertex arrays, and
//          element3s / element3i hold the triangle indices.
//
// Outline of the visible code:
//  1. validate state, clamp the thread's y bands to the scissor rectangle and
//     test the command's [starty, endy) range against both bands;
//  2. for every triangle: fetch its three indices, clip against the near plane
//     (clipdist = z + w), take projected or interpolated screen coordinates,
//     build a 16-bit integer bounding box clamped to the scissor/band limits
//     and reject offscreen triangles;
//  3. compute the screen-space plane (x slope, y slope, origin) of w and of
//     every vertex array used by the current shader mode, plus a mip level per
//     texture unit;
//  4. walk the triangle's edges scanline by scanline and emit spans of at most
//     DPSOFTRAST_DRAW_MAXSPANLENGTH pixels, flushing the queues through
//     DPSOFTRAST_Draw_ProcessSpans when they fill up;
//  5. drop this thread's reference on the command (freeing command->arrays when
//     the count reaches zero and the data was allocated outside the command)
//     and flush whatever is still queued.
4300 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4303 int cullface = thread->cullface;
4304 int minx, maxx, miny, maxy;
4305 int miny1, maxy1, miny2, maxy2;
4306 __m128i fbmin, fbmax;
4307 __m128 viewportcenter, viewportscale;
4308 int firstvertex = command->firstvertex;
4309 int numvertices = command->numvertices;
4310 int numtriangles = command->numtriangles;
4311 const int *element3i = command->element3i;
4312 const unsigned short *element3s = command->element3s;
4313 int clipped = command->clipped;
4320 int starty, endy, bandy;
4324 __m128 triangleedge1, triangleedge2, trianglenormal;
4327 DPSOFTRAST_State_Triangle *triangle;
4328 DPSOFTRAST_Texture *texture;
4329 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// scissor rectangle in y, then the thread's two y bands clamped into it
4330 miny = thread->fb_scissor[1];
4331 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4332 miny1 = bound(miny, thread->miny1, maxy);
4333 maxy1 = bound(miny, thread->maxy1, maxy);
4334 miny2 = bound(miny, thread->miny2, maxy);
4335 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's y range overlaps neither band: this thread only has to drop
// its reference on the command
// NOTE(review): presumably followed by an early return - confirm
4336 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4338 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than its aligned header keeps its data in a separate
// MM_CALLOC allocation (see the allocation code), which is released here
4340 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4341 MM_FREE(command->arrays);
4345 minx = thread->fb_scissor[0];
4346 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp limits as interleaved 16-bit (x, y) pairs; fbmax is inclusive (max - 1)
4347 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4348 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4349 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4350 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4351 screen[3] = _mm_setzero_ps();
4352 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4353 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices projected screen coordinates; the
// post-transform position array follows directly after them
4355 const float *screencoord4f = command->arrays;
4356 const float *arrays = screencoord4f + numvertices*4;
4358 // generate the 3 edges of this triangle
4359 // generate spans for the triangle - switch based on left split or right split classification of triangle
// indices are made relative to the first vertex of the command
// NOTE(review): presumably the 16-bit indices are used when element3s is
// non-NULL and the 32-bit ones otherwise - confirm
4362 e[0] = element3s[i*3+0] - firstvertex;
4363 e[1] = element3s[i*3+1] - firstvertex;
4364 e[2] = element3s[i*3+2] - firstvertex;
4368 e[0] = element3i[i*3+0] - firstvertex;
4369 e[1] = element3i[i*3+1] - firstvertex;
4370 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below:
//  SKIPBACKFACE       builds two edges from screen[1], forms the scalar cross
//                     product term in trianglenormal and compares it with zero
//                     (NOTE(review): presumably selected by cullface - confirm)
//  CLIPPEDVERTEXLERP  stores clipfrac[p1] = clipdist[p1] / (clipdist[p1] - clipdist[p2]),
//                     interpolates the position between vertices p1 and p2 and
//                     projects it into screen[k]
//  CLIPPEDVERTEXCOPY  loads the already projected coordinates of vertex p1
//                     into screen[k]
//  GENATTRIBCOPY/LERP the same two operations for a vertex attribute, reusing
//                     the clipfrac values stored by CLIPPEDVERTEXLERP
//  GENATTRIBS         picks the copy/lerp combination matching the clip case
4379 #define SKIPBACKFACE \
4380 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4381 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4382 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4383 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4384 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4388 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4392 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4397 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4398 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4400 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4401 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4403 #define CLIPPEDVERTEXCOPY(k,p1) \
4404 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4406 #define GENATTRIBCOPY(attrib, p1) \
4407 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4408 #define GENATTRIBLERP(attrib, p1, p2) \
4410 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4411 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4413 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4417 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4418 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4419 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4420 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4421 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4422 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4423 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4429 // calculate distance from nearplane
4430 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4431 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4432 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// the nested tests below enumerate which of the three vertices are in front of
// the near plane (clipdist >= 0); clipping one vertex yields a quad that uses
// screen[3], clipping two yields a triangle
4433 if (clipdist[0] >= 0.0f)
4435 if (clipdist[1] >= 0.0f)
4437 if (clipdist[2] >= 0.0f)
4440 // triangle is entirely in front of nearplane
4441 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4448 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4456 if (clipdist[2] >= 0.0f)
4458 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4465 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4472 else if (clipdist[1] >= 0.0f)
4474 if (clipdist[2] >= 0.0f)
4476 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4483 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4489 else if (clipdist[2] >= 0.0f)
4491 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4496 else continue; // triangle is entirely behind nearplane
4499 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as saturated 16-bit
// values; when there are only three points, point 2 is duplicated into the
// fourth slot. The min/max/shuffle sequence reduces them to a bounding box,
// which is then clamped to fbmin/fbmax.
4500 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4501 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4502 screenmin = _mm_min_epi16(screeni, screenir),
4503 screenmax = _mm_max_epi16(screeni, screenir);
4504 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4505 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4506 screenmin = _mm_max_epi16(screenmin, fbmin);
4507 screenmax = _mm_min_epi16(screenmax, fbmax);
4508 // skip offscreen triangles
4509 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// 16-bit element 1 of the clamped box is its y component
4511 starty = _mm_extract_epi16(screenmin, 1);
4512 endy = _mm_extract_epi16(screenmax, 1)+1;
// rows lie entirely between the end of band 1 and the start of band 2
// NOTE(review): presumably the triangle is skipped in that case - confirm
4513 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift keeps the high 16 bits of each 32-bit lane: the integer y of each point
4515 screeny = _mm_srai_epi32(screeni, 16);
4518 triangle = &thread->triangles[thread->numtriangles];
4520 // calculate attribute plans for triangle data...
4521 // okay, this triangle is going to produce spans, we'd better project
4522 // the interpolants now (this is what gives perspective texturing),
4523 // this consists of simply multiplying all arrays by the W coord
4524 // (which is basically 1/Z), which will be undone per-pixel
4525 // (multiplying by Z again) to get the perspective-correct array
4528 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4529 __m128 mipedgescale, mipdensity;
// edge components divided by the value SKIPBACKFACE left in trianglenormal,
// then broadcast one component per register
4530 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4531 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4532 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4533 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4534 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4535 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4536 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4537 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane of w itself, expressed relative to point 1 and then moved to the screen origin
4538 attribedge1 = _mm_sub_ss(w0, w1);
4539 attribedge2 = _mm_sub_ss(w2, w1);
4540 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4541 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4542 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4543 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4544 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// triangle->w = { x slope, y slope, origin }, the layout read back by the span processing code
4545 _mm_store_ss(&triangle->w[0], attribxslope);
4546 _mm_store_ss(&triangle->w[1], attribyslope);
4547 _mm_store_ss(&triangle->w[2], attriborigin);
4548 mipedgescale = _mm_setzero_ps();
// one plane per vertex array listed for the current shader mode
4549 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4551 __m128 attrib0, attrib1, attrib2;
4552 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// an out-of-range index marks the end of the list
// NOTE(review): presumably followed by break - confirm
4553 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// the arrays are stored back to back, numvertices float[4] entries each
4555 arrays += numvertices*4;
4556 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are multiplied by w before the plane is fitted
4557 attriborigin = _mm_mul_ps(attrib1, w1);
4558 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4559 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4560 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4561 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4562 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4563 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4564 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4565 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// the array named by lodarrayindex drives mip selection: attribute change
// along two edges scaled by the reciprocal screen-space edge length
4566 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4568 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4569 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4570 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4571 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4575 memset(triangle->mip, 0, sizeof(triangle->mip));
// choose a mip level for every texture unit listed for the current shader mode
4576 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4578 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// an out-of-range unit marks the end of the list
// NOTE(review): presumably followed by break - confirm
4579 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4581 texture = thread->texbound[texunit];
4582 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two integers loaded from texture->mipmap[0][2..3], square, sum the
// pairs, and keep the smaller of the two edge results
4584 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4585 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4586 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4587 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4588 // this will be multiplied in the texturing routine by the texture resolution
4589 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm of the squared density, limited to the last mip level
4592 y = (int)(log((float)y)*0.5f/M_LN2);
4593 if (y > texture->mipmaps - 1)
4594 y = texture->mipmaps - 1;
4595 triangle->mip[texunit] = y;
// scan conversion: rows are processed first up to the end of band 1, then
// from the start of band 2
4601 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4604 __m128 xcoords, xslope;
// each 32-bit lane of ycc is all ones when y is greater than that point's
// integer y; movemask turns this into one hex nibble per point (point 0 in
// the lowest nibble). In the /*....*/ annotations a 1 marks a point whose
// nibble is 0, listed with point 0 first.
4605 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4606 int yccmask = _mm_movemask_epi8(ycc);
4607 int edge0p, edge0n, edge1p, edge1n;
// edge0 and edge1 are the two edges crossing the current row, each given as a
// (previous, next) pair of point indices; the first table handles four points
4614 case 0xFFFF: /*0000*/ y = endy; continue;
4615 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4616 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4617 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4618 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4619 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4620 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4621 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4622 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4623 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4624 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4625 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4626 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4627 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4628 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4629 case 0x0000: /*1111*/ y++; continue;
// three-point table; the fourth slot duplicates point 2, so the two upper
// nibbles are always equal
4637 case 0xFFFF: /*000*/ y = endy; continue;
4638 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4639 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4640 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4641 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4642 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4643 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4644 case 0x0000: /*111*/ y++; continue;
// nexty: the last row that can be walked with the current edge pair, taken
// from the smallest point y not yet passed and limited to the current band
4647 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4648 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4649 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4650 nexty = _mm_extract_epi16(ycc, 0);
4651 if (nexty >= bandy) nexty = bandy-1;
// compares the larger x of edge0's endpoints with the smaller x of edge1's
// NOTE(review): presumably the two edges are swapped when this holds - confirm
4652 if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
// dx/dy of both edges at once (edge0 in the low half, edge1 in the high half),
// then the x of each edge at row y, with 0.5 added before integer conversion
4661 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4662 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4663 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4664 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4665 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4666 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4668 int startx, endx, offset;
4669 startx = _mm_cvtss_si32(xcoords);
4670 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// NOTE(review): the mask arithmetic assumes DPSOFTRAST_DRAW_MAXSPANLENGTH is a
// power of two - confirm
4673 if (startx < 0) startx = 0;
4674 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4676 if (endx > maxx) endx = maxx;
4677 if (startx >= endx) continue;
// cut the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// span->startx and span->endx are relative to the chunk offset
4678 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4680 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4681 span->triangle = thread->numtriangles;
4684 span->startx = max(minx - offset, 0);
4685 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4686 if (span->startx >= span->endx)
// span queue full: shade what has been collected so far
4688 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4689 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush the spans that refer to it and start over
4694 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4696 DPSOFTRAST_Draw_ProcessSpans(thread);
4697 thread->numtriangles = 0;
// this thread is done with the command; the thread that drops the count to
// zero releases separately allocated vertex data
4701 if (!ATOMIC_DECREMENT(command->refcount))
4703 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4704 MM_FREE(command->arrays);
// shade whatever is still queued
4707 if (thread->numspans > 0 || thread->numtriangles > 0)
4709 DPSOFTRAST_Draw_ProcessSpans(thread);
4710 thread->numtriangles = 0;
// Allocates a Draw command plus the storage for its vertex data and indices,
// and points the global post-transform array table at that storage.
//
// firstvertex, numvertices, numtriangles: copied into the command.
// element3i / element3s: triangle indices; the visible code copies them into
//   the command's own storage.
//   NOTE(review): presumably only whichever pointer is non-NULL is used,
//   with element3s taking precedence - confirm.
//
// Data layout, in order: numvertices screen coordinates, numvertices
// post-transform positions, one float[4] array per entry in the current shader
// mode's arrays list, then the indices.
// NOTE(review): presumably returns the new command - the return statement is
// not among the lines shown.
4715 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4719 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// screen coordinates + post-transform positions
4720 int datasize = 2*numvertices*sizeof(float[4]);
4721 DPSOFTRAST_Command_Draw *command;
4722 unsigned char *data;
// one more float[4] array per entry in the shader mode's arrays list
4723 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4725 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// an out-of-range index marks the end of the list
// NOTE(review): presumably followed by break - confirm
4726 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4728 datasize += numvertices*sizeof(float[4]);
// room for the index copy, 16-bit or 32-bit
4731 datasize += numtriangles*sizeof(unsigned short[3]);
4733 datasize += numtriangles*sizeof(int[3]);
4734 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to live inside the command: the command holds the header only and
// the data comes from MM_CALLOC; the draw code detects this case through
// commandsize and releases the buffer with MM_FREE
// NOTE(review): no NULL check on the MM_CALLOC result is visible - confirm
4735 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4737 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4738 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data is placed directly behind the command header
4742 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4743 data = (unsigned char *)command + commandsize;
4745 command->firstvertex = firstvertex;
4746 command->numvertices = numvertices;
4747 command->numtriangles = numtriangles;
4748 command->arrays = (float *)data;
// reset the global output pointers, then hand out consecutive slices of data
4749 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4750 dpsoftrast.firstvertex = firstvertex;
4751 dpsoftrast.numvertices = numvertices;
4752 dpsoftrast.screencoord4f = (float *)data;
4753 data += numvertices*sizeof(float[4]);
4754 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4755 data += numvertices*sizeof(float[4]);
4756 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4758 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4759 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4761 dpsoftrast.post_array4f[j] = (float *)data;
4762 data += numvertices*sizeof(float[4]);
// the indices are copied so that the command does not depend on the caller's buffers
4764 command->element3i = NULL;
4765 command->element3s = NULL;
4768 command->element3s = (unsigned short *)data;
4769 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4773 command->element3i = (int *)data;
4774 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// DPSOFTRAST_DrawTriangles
// Queues one draw call: allocates the Draw command, runs the current shader
// mode's Vertex() function, clamps the affected row range to the framebuffer
// height, and wakes the threads whose rows the draw touches.
// Parameters are forwarded unchanged to DPSOFTRAST_Draw_AllocateDrawCommand.
// NOTE(review): the embedded source line numbers skip, so braces, the early
// return and any preprocessor lines are not visible in this listing.
4779 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4781 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4782 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// drawstarty/drawendy/drawclipped are read after Vertex(); presumably Vertex()
// (or something it calls) produces them - confirm
4783 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4784 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release the data block if the command's size shows it holds
// only the header, then take the command back out of the pool. A return
// presumably follows on a line not shown (4790).
4785 if (command->starty >= command->endy)
4787 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4788 MM_FREE(command->arrays);
4789 DPSOFTRAST_UndoCommand(command->commandsize);
4792 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; the draw handling earlier in the file does
// ATOMIC_DECREMENT(command->refcount) and frees the arrays when it reaches zero
4793 command->refcount = dpsoftrast.numthreads;
4796 DPSOFTRAST_Draw_SyncCommands();
// signal only threads that are starving and whose first or second row band
// overlaps [starty, endy)
4799 for (i = 0; i < dpsoftrast.numthreads; i++)
4801 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4802 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4803 SDL_CondSignal(thread->drawcond);
// NOTE(review): lines 4804-4806 are not shown; this call is presumably the
// alternative to the signalling loop above (e.g. a build without threads) - confirm
4807 DPSOFTRAST_Draw_FlushThreads();
// DPSOFTRAST_Draw_InterpretCommands
// Executes queued commands on behalf of one thread, starting at
// thread->commandoffset and stopping when the running offset equals endoffset.
// Each command is dispatched on its opcode to the matching
// DPSOFTRAST_Interpret_<name> function, and the offset reached is stored back
// into thread->commandoffset.
// NOTE(review): the embedded source line numbers skip, so braces, break
// statements and some one-line statements are not visible in this listing.
4811 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4813 int commandoffset = thread->commandoffset;
4814 while (commandoffset != endoffset)
4816 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4817 switch (command->opcode)
// INTERPCOMMAND(name) generates the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned sizeof of
// DPSOFTRAST_Command_<name>, and wrap the offset to 0 once it reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL
4819 #define INTERPCOMMAND(name) \
4820 case DPSOFTRAST_OPCODE_##name : \
4821 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4822 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4823 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4824 commandoffset = 0; \
4826 INTERPCOMMAND(Viewport)
4827 INTERPCOMMAND(ClearColor)
4828 INTERPCOMMAND(ClearDepth)
4829 INTERPCOMMAND(ColorMask)
4830 INTERPCOMMAND(DepthTest)
4831 INTERPCOMMAND(ScissorTest)
4832 INTERPCOMMAND(Scissor)
4833 INTERPCOMMAND(BlendFunc)
4834 INTERPCOMMAND(BlendSubtract)
4835 INTERPCOMMAND(DepthMask)
4836 INTERPCOMMAND(DepthFunc)
4837 INTERPCOMMAND(DepthRange)
4838 INTERPCOMMAND(PolygonOffset)
4839 INTERPCOMMAND(CullFace)
4840 INTERPCOMMAND(AlphaTest)
4841 INTERPCOMMAND(AlphaFunc)
4842 INTERPCOMMAND(SetTexture)
4843 INTERPCOMMAND(SetShader)
4844 INTERPCOMMAND(Uniform4f)
4845 INTERPCOMMAND(UniformMatrix4f)
4846 INTERPCOMMAND(Uniform1i)
// Draw is handled by hand because its size varies: the offset advances by the
// size stored in the command, and thread->commandoffset is updated right after
// each draw. The statement controlled by the pool-size test (line 4852) is not
// shown; presumably it wraps the offset to 0 like the macro does - confirm.
4848 case DPSOFTRAST_OPCODE_Draw:
4849 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4850 commandoffset += command->commandsize;
4851 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4853 thread->commandoffset = commandoffset;
// NOTE(review): the body of the Reset case (lines 4857-4858) is not shown
4856 case DPSOFTRAST_OPCODE_Reset:
// record the final position once the loop has reached endoffset
4861 thread->commandoffset = commandoffset;
// DPSOFTRAST_Draw_Thread
// Thread entry point (handed to SDL_CreateThread by DPSOFTRAST_Init); data is
// the thread's own DPSOFTRAST_State_Thread. Loops for as long as
// thread->index is non-negative: interprets any commands between its own
// offset and dpsoftrast.drawcommand, then sleeps on drawcond when caught up.
// NOTE(review): braces and the return statement are on lines not shown.
4865 static int DPSOFTRAST_Draw_Thread(void *data)
4867 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4868 while(thread->index >= 0)
// work pending: run it without holding the mutex
4870 if (thread->commandoffset != dpsoftrast.drawcommand)
4872 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// re-check under the mutex before sleeping; the enclosing while loop repeats
// the whole test after every wakeup
4876 SDL_LockMutex(thread->drawmutex);
4877 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// caught up: release anyone blocked on waitcond (DPSOFTRAST_Draw_FlushThreads
// sets thread->waiting before waiting on it), then mark this thread as
// starving so producers know to signal drawcond
4879 if (thread->waiting) SDL_CondSignal(thread->waitcond);
4880 thread->starving = true;
4881 SDL_CondWait(thread->drawcond, thread->drawmutex);
4882 thread->starving = false;
4884 SDL_UnlockMutex(thread->drawmutex);
// DPSOFTRAST_Draw_FlushThreads
// Blocks until every thread has consumed all queued commands, then marks the
// command pool as empty.
//   pass 1: wake each thread that is behind and currently starving
//   pass 2: for each thread that is still behind, wait on its waitcond until
//           its commandoffset has reached dpsoftrast.drawcommand
// NOTE(review): the embedded source line numbers skip, so braces, the
// declaration of i and any preprocessor lines are not visible in this listing.
4891 static void DPSOFTRAST_Draw_FlushThreads(void)
4893 DPSOFTRAST_State_Thread *thread;
4895 DPSOFTRAST_Draw_SyncCommands();
4897 for (i = 0; i < dpsoftrast.numthreads; i++)
4899 thread = &dpsoftrast.threads[i];
// cheap unlocked test first, then the same test again under the mutex before signalling
4900 if (thread->commandoffset != dpsoftrast.drawcommand)
4902 SDL_LockMutex(thread->drawmutex);
4903 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4904 SDL_CondSignal(thread->drawcond);
4905 SDL_UnlockMutex(thread->drawmutex);
4909 for (i = 0; i < dpsoftrast.numthreads; i++)
4911 thread = &dpsoftrast.threads[i];
4913 if (thread->commandoffset != dpsoftrast.drawcommand)
4915 SDL_LockMutex(thread->drawmutex);
// Re-test the predicate after every wakeup instead of only once: a condition
// wait may return without the thread having caught up, and falling through
// early would let usedcommands be reset below while commands are still being read.
4916 while (thread->commandoffset != dpsoftrast.drawcommand)
4918 thread->waiting = true;
4919 SDL_CondWait(thread->waitcond, thread->drawmutex);
4920 thread->waiting = false;
4922 SDL_UnlockMutex(thread->drawmutex);
// NOTE(review): lines 4923-4924 are not shown; this direct interpretation on the
// calling thread is presumably the alternative path for builds without threads - confirm
4925 if (thread->commandoffset != dpsoftrast.drawcommand)
4926 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// every thread has caught up, so the pool's usage counter can be reset
4929 dpsoftrast.commandpool.usedcommands = 0;
// DPSOFTRAST_Flush
// Public wrapper: calls DPSOFTRAST_Draw_FlushThreads, which waits for all
// threads to finish the queued commands.
4932 void DPSOFTRAST_Flush(void)
4934 DPSOFTRAST_Draw_FlushThreads();
// DPSOFTRAST_Finish
// Public entry point taking no arguments.
// NOTE(review): the body (source lines 4938-4941) is not visible in this
// listing; presumably it completes all queued work in the same way as
// DPSOFTRAST_Flush - confirm.
4937 void DPSOFTRAST_Finish(void)
4942 void DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4952 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4953 dpsoftrast.bigendian = u.b[3];
4954 dpsoftrast.fb_width = width;
4955 dpsoftrast.fb_height = height;
4956 dpsoftrast.fb_depthpixels = depthpixels;
4957 dpsoftrast.fb_colorpixels[0] = colorpixels;
4958 dpsoftrast.fb_colorpixels[1] = NULL;
4959 dpsoftrast.fb_colorpixels[1] = NULL;
4960 dpsoftrast.fb_colorpixels[1] = NULL;
4961 dpsoftrast.viewport[0] = 0;
4962 dpsoftrast.viewport[1] = 0;
4963 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4964 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4965 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4966 dpsoftrast.texture_firstfree = 1;
4967 dpsoftrast.texture_end = 1;
4968 dpsoftrast.texture_max = 0;
4969 dpsoftrast.color[0] = 1;
4970 dpsoftrast.color[1] = 1;
4971 dpsoftrast.color[2] = 1;
4972 dpsoftrast.color[3] = 1;
4973 dpsoftrast.interlace = bound(0, interlace, 1);
4975 dpsoftrast.numthreads = bound(1, numthreads, 64);
4977 dpsoftrast.numthreads = 1;
4979 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4980 for (i = 0; i < dpsoftrast.numthreads; i++)
4982 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4984 thread->cullface = GL_BACK;
4985 thread->colormask[1] = 1;
4986 thread->colormask[2] = 1;
4987 thread->colormask[3] = 1;
4988 thread->blendfunc[0] = GL_ONE;
4989 thread->blendfunc[1] = GL_ZERO;
4990 thread->depthmask = true;
4991 thread->depthtest = true;
4992 thread->depthfunc = GL_LEQUAL;
4993 thread->scissortest = false;
4994 thread->alphatest = false;
4995 thread->alphafunc = GL_GREATER;
4996 thread->alphavalue = 0.5f;
4997 thread->viewport[0] = 0;
4998 thread->viewport[1] = 0;
4999 thread->viewport[2] = dpsoftrast.fb_width;
5000 thread->viewport[3] = dpsoftrast.fb_height;
5001 thread->scissor[0] = 0;
5002 thread->scissor[1] = 0;
5003 thread->scissor[2] = dpsoftrast.fb_width;
5004 thread->scissor[3] = dpsoftrast.fb_height;
5005 thread->depthrange[0] = 0;
5006 thread->depthrange[1] = 1;
5007 thread->polygonoffset[0] = 0;
5008 thread->polygonoffset[1] = 0;
5010 if (dpsoftrast.interlace)
5012 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5013 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5014 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5015 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5019 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5020 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5023 thread->numspans = 0;
5024 thread->numtriangles = 0;
5025 thread->commandoffset = 0;
5026 thread->waiting = false;
5027 thread->starving = false;
5029 thread->waitcond = SDL_CreateCond();
5030 thread->drawcond = SDL_CreateCond();
5031 thread->drawmutex = SDL_CreateMutex();
5034 thread->validate = -1;
5035 DPSOFTRAST_Validate(thread, -1);
5037 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5042 void DPSOFTRAST_Shutdown(void)
5046 if (dpsoftrast.numthreads > 0)
5048 DPSOFTRAST_State_Thread *thread;
5049 for (i = 0; i < dpsoftrast.numthreads; i++)
5051 thread = &dpsoftrast.threads[i];
5052 SDL_LockMutex(thread->drawmutex);
5054 SDL_CondSignal(thread->drawcond);
5055 SDL_UnlockMutex(thread->drawmutex);
5056 SDL_WaitThread(thread->thread, NULL);
5057 SDL_DestroyCond(thread->waitcond);
5058 SDL_DestroyCond(thread->drawcond);
5059 SDL_DestroyMutex(thread->drawmutex);
5063 for (i = 0;i < dpsoftrast.texture_end;i++)
5064 if (dpsoftrast.texture[i].bytes)
5065 MM_FREE(dpsoftrast.texture[i].bytes);
5066 if (dpsoftrast.texture)
5067 free(dpsoftrast.texture);
5068 if (dpsoftrast.threads)
5069 MM_FREE(dpsoftrast.threads);
5070 memset(&dpsoftrast, 0, sizeof(dpsoftrast));