3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
// Alignment (in bytes) used for ATOMIC()-qualified data and for MM_MALLOC/MM_CALLOC.
14 #define ATOMIC_SIZE 32
// Compiler-specific alignment and atomic-counter macros. The lines below use
// GCC-style __attribute__ and __sync_* builtins.
// NOTE(review): the opening #if of this conditional is not visible here.
18 #define ALIGN(var) var __attribute__((__aligned__(16)))
19 #define ATOMIC(var) var __attribute__((__aligned__(32)))
20 #define MEMORY_BARRIER (_mm_sfence())
21 //(__sync_synchronize())
22 #define ATOMIC_COUNTER volatile int
23 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
24 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
25 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC variant: __declspec(align) and the Interlocked* functions.
26 #elif defined(_MSC_VER)
27 #define ALIGN(var) __declspec(align(16)) var
28 #define ATOMIC(var) __declspec(align(32)) var
29 #define MEMORY_BARRIER (_mm_sfence())
31 #define ATOMIC_COUNTER volatile LONG
32 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
33 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
34 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: no alignment request, and plain non-atomic operations for any
// macro that has not been defined above.
39 #define ALIGN(var) var
42 #define ATOMIC(var) var
44 #ifndef MEMORY_BARRIER
45 #define MEMORY_BARRIER ((void)0)
47 #ifndef ATOMIC_COUNTER
48 #define ATOMIC_COUNTER int
50 #ifndef ATOMIC_INCREMENT
51 #define ATOMIC_INCREMENT(counter) (++(counter))
53 #ifndef ATOMIC_DECREMENT
54 #define ATOMIC_DECREMENT(counter) (--(counter))
57 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
61 #include <emmintrin.h>
// Allocates size bytes aligned to ATOMIC_SIZE; pair with MM_FREE.
63 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
65 static void *MM_CALLOC(size_t nmemb, size_t size)
67 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
68 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Releases memory obtained from the _mm_malloc based MM_MALLOC/MM_CALLOC.
72 #define MM_FREE _mm_free
// Plain C library variants of the allocation macros.
// NOTE(review): the preprocessor conditional that selects between the two
// sets of definitions is not visible here.
74 #define MM_MALLOC(size) malloc(size)
75 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays. DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size attribs[] and post_array4f[] below.
79 typedef enum DPSOFTRAST_ARRAY_e
81 DPSOFTRAST_ARRAY_POSITION,
82 DPSOFTRAST_ARRAY_COLOR,
83 DPSOFTRAST_ARRAY_TEXCOORD0,
84 DPSOFTRAST_ARRAY_TEXCOORD1,
85 DPSOFTRAST_ARRAY_TEXCOORD2,
86 DPSOFTRAST_ARRAY_TEXCOORD3,
87 DPSOFTRAST_ARRAY_TEXCOORD4,
88 DPSOFTRAST_ARRAY_TEXCOORD5,
89 DPSOFTRAST_ARRAY_TEXCOORD6,
90 DPSOFTRAST_ARRAY_TEXCOORD7,
91 DPSOFTRAST_ARRAY_TOTAL
// One texture slot of the dpsoftrast.texture array.
// NOTE(review): only part of the field list is visible here.
95 typedef struct DPSOFTRAST_Texture_s
// filter mode, set by DPSOFTRAST_Texture_Filter
102 DPSOFTRAST_TEXTURE_FILTER filter;
105 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL pointer marks the slot as unused
106 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes,
// [2] width, [3] height, [4] depth
107 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command sizes are rounded up to COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND)
// and command structs are declared with COMMAND_ALIGN.
111 #define COMMAND_SIZE ALIGN_SIZE
112 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command stored in the command pool.
114 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
116 unsigned char opcode;
117 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when a command does not
// fit in the space remaining before the end of the pool.
121 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> and the struct type DPSOFTRAST_Command_<name>,
// which starts with the same opcode/commandsize members as DPSOFTRAST_Command.
123 #define DEFCOMMAND(opcodeval, name, fields) \
124 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
125 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
127 unsigned char opcode; \
128 unsigned short commandsize; \
130 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer, and (judging by the name) the upper
// bound on a single command's size - TODO confirm where the latter is enforced.
132 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
133 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Byte buffer that commands are allocated from; DPSOFTRAST_AllocateCommand
// wraps back to the start when a command does not fit before the end.
// NOTE(review): the freecommand/usedcommands members used elsewhere are not visible here.
135 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
139 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
141 DPSOFTRAST_State_Command_Pool);
// Per-triangle data consumed when shading spans.
143 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
145 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array: [0] slope applied to span x, [1] slope applied to
// span y, [2] base value (see DPSOFTRAST_CALCATTRIB)
147 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
149 DPSOFTRAST_State_Triangle);
// SSE evaluation of one attribute at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data are __m128 lvalues; the attribs rows are read with the
// aligned load _mm_load_ps.
151 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
152 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
153 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
154 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
155 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB. For each of the four components c:
//   slope[c] = attribs[arrayindex][0][c]
//   data[c]  = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c]
// triangle and span are pointers; data and slope are float[4] lvalues.
// Every pointer/array argument is parenthesized so that expressions such as
// &localspan or base + n can be passed safely.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
168 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle.
170 typedef ALIGN(struct DPSOFTRAST_State_Span_s
172 int triangle; // triangle this span was generated by
173 int x; // framebuffer x coord
174 int y; // framebuffer y coord
175 int startx; // usable range (according to pixelmask)
176 int endx; // usable range (according to pixelmask)
177 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
179 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
181 #define DPSOFTRAST_DRAW_MAXSPANS 1024
182 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in thread->validate. Each bit names a group of derived
// values that DPSOFTRAST_Validate recomputes on demand.
184 #define DPSOFTRAST_VALIDATE_FB 1
185 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
186 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups combined
187 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc derives from the
// blendfunc/blendsubtract state; OPAQUE is also its fallback for
// combinations it does not recognize.
189 typedef enum DPSOFTRAST_BLENDMODE_e
191 DPSOFTRAST_BLENDMODE_OPAQUE,
192 DPSOFTRAST_BLENDMODE_ALPHA,
193 DPSOFTRAST_BLENDMODE_ADDALPHA,
194 DPSOFTRAST_BLENDMODE_ADD,
195 DPSOFTRAST_BLENDMODE_INVMOD,
196 DPSOFTRAST_BLENDMODE_MUL,
197 DPSOFTRAST_BLENDMODE_MUL2,
198 DPSOFTRAST_BLENDMODE_SUBALPHA,
199 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
200 DPSOFTRAST_BLENDMODE_INVADD,
201 DPSOFTRAST_BLENDMODE_TOTAL
203 DPSOFTRAST_BLENDMODE;
// State owned by one entry of dpsoftrast.threads: a copy of the render state
// (updated by the DPSOFTRAST_Interpret_* functions), values derived from it,
// and scratch storage for spans and triangles.
// NOTE(review): many members referenced elsewhere (viewport, scissor,
// validate, drawmutex, ...) are not visible in this excerpt.
205 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
224 float polygonoffset[2];
227 int shader_permutation;
229 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
231 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
232 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
234 // DPSOFTRAST_VALIDATE_ flags
237 // derived values (DPSOFTRAST_VALIDATE_FB)
240 ALIGN(float fb_viewportcenter[4]);
241 ALIGN(float fb_viewportscale[4]);
243 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
246 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; compared against dpsoftrast.drawcommand and
// commandpool.freecommand in DPSOFTRAST_Draw_FreeCommandPool
255 ATOMIC(volatile int commandoffset);
// flags used together with drawmutex, drawcond and waitcond in
// DPSOFTRAST_Draw_FreeCommandPool
257 volatile bool waiting;
258 volatile bool starving;
265 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
266 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
268 DPSOFTRAST_State_Thread);
// Global rasterizer state, instantiated once as `dpsoftrast` below.
// NOTE(review): only part of the member list is visible in this excerpt.
270 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets as passed to DPSOFTRAST_SetRenderTargets
274 unsigned int *fb_depthpixels;
275 unsigned int *fb_colorpixels[4];
// viewport transform, recomputed by DPSOFTRAST_Viewport
278 ALIGN(float fb_viewportcenter[4]);
279 ALIGN(float fb_viewportscale[4]);
282 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
283 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
285 const float *pointer_vertex3f;
286 const float *pointer_color4f;
287 const unsigned char *pointer_color4ub;
288 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
291 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
292 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the texture array; rebased by DPSOFTRAST_Texture_Grow
293 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
297 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
298 float *screencoord4f;
304 int shader_permutation;
// lowest slot index at which DPSOFTRAST_Texture_New starts looking for a free slot
308 int texture_firstfree;
// growable texture array (see DPSOFTRAST_Texture_Grow)
309 DPSOFTRAST_Texture *texture;
// text of the most recent error, set by the functions in this file
314 const char *errorstring;
319 DPSOFTRAST_State_Thread *threads;
// set from commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
321 ATOMIC(volatile int drawcommand);
323 DPSOFTRAST_State_Command_Pool commandpool;
327 DPSOFTRAST_State dpsoftrast;
// Scale applied to (1 - depth) when converting a float depth to the integer
// depth buffer format (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one BGRA8 pixel: blue in
// bits 0-7, green in bits 8-15, red in bits 16-23, alpha in bits 24-31.
// Each argument is parenthesized so that expressions can be passed, and the
// shifts are done on unsigned values so that alpha >= 0.5 does not overflow a
// signed int. The result is still of type int, as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	((unsigned int)(int)((b) * 255.0f + 0.5f)) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d into the integer depth buffer format; d = 1 maps
// to 0 and d = 0 maps to DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
337 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339 fb_viewportcenter[3] = 0.5f;
340 fb_viewportcenter[0] = 0.0f;
341 fb_viewportscale[1] = 0.5f * viewport[2];
342 fb_viewportscale[2] = -0.5f * viewport[3];
343 fb_viewportscale[3] = 0.5f;
344 fb_viewportscale[0] = 1.0f;
// Recomputes thread->fb_scissor and the thread's viewport transform from the
// thread's scissor/viewport state and the framebuffer size.
// NOTE(review): the declarations of x1/y1/x2/y2 and some statements are not
// visible in this excerpt; in the visible lines only the upper bounds are clamped.
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
349 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350 // and viewport projection values
353 x1 = thread->scissor[0];
354 x2 = thread->scissor[0] + thread->scissor[2];
// the scissor rectangle is flipped vertically against fb_height
355 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with scissor testing off the whole framebuffer is used
357 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
359 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
361 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as origin plus size
362 thread->fb_scissor[0] = x1;
363 thread->fb_scissor[1] = y1;
364 thread->fb_scissor[2] = x2 - x1;
365 thread->fb_scissor[3] = y2 - y1;
367 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
372 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the (source factor, destination factor) pair in thread->blendfunc to a
// DPSOFTRAST_BLENDMODE stored in thread->fb_blendmode. The two factors are
// packed into one switch key as (sfactor<<16)|dfactor; unrecognized pairs
// fall back to DPSOFTRAST_BLENDMODE_OPAQUE.
// NOTE(review): braces and branch keywords are not visible in this excerpt;
// the first switch presumably applies only when blendsubtract is set and the
// second one otherwise - confirm against the full file.
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
377 if (thread->blendsubtract)
379 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// helper that emits one case label per recognized factor pair
381 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
383 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
389 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
391 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two different factor pairs both select MUL
396 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is still set in thread->validate; the expression always evaluates to 0.
// thread is parenthesized so that expressions such as &threads[i] work; it is
// evaluated more than once, so it must not have side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
410 mask &= thread->validate;
413 if (mask & DPSOFTRAST_VALIDATE_FB)
415 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416 DPSOFTRAST_RecalcFB(thread);
418 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
420 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421 DPSOFTRAST_RecalcDepthFunc(thread);
423 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
425 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426 DPSOFTRAST_RecalcBlendFunc(thread);
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
432 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and rebases every bound-texture
// pointer (global and per thread) onto the reallocated array.
// NOTE(review): the declarations of i and j and a few other lines are not
// visible in this excerpt.
437 static void DPSOFTRAST_Texture_Grow(void)
// remembered so pointers into the old array can be converted to offsets
439 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440 DPSOFTRAST_State_Thread *thread;
444 // expand texture array as needed
445 if (dpsoftrast.texture_max < 1024)
446 dpsoftrast.texture_max = 1024;
448 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored straight into dpsoftrast.texture
// and is not checked in the visible lines; on failure the old array pointer
// would be lost and the code below would compute pointers from NULL.
449 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings
450 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451 if (dpsoftrast.texbound[i])
452 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase each thread's bindings the same way
453 for (j = 0; j < dpsoftrast.numthreads; j++)
455 thread = &dpsoftrast.threads[j];
456 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457 if (thread->texbound[i])
458 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture of the given dimensions. flags carries the
// DPSOFTRAST_TEXTURE_FLAG_* bits and the texture format. On a rejected request
// dpsoftrast.errorstring is set to a message describing the problem.
// NOTE(review): many lines (local declarations, returns, the switch header and
// the mip level loop) are not visible in this excerpt; comments below describe
// only what the visible lines establish.
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces
471 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is evaluated before the per-dimension size
// limit below, so it can overflow int for very large arguments.
474 if (width*height*depth < 1)
476 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
479 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
481 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
486 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
490 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
493 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
498 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
501 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
503 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on flag combinations
508 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
510 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
513 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
515 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check just above
518 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
520 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
523 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x is not a power of two (for x >= 1)
528 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
533 // find first empty slot in texture array
534 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535 if (!dpsoftrast.texture[texnum].bytes)
537 dpsoftrast.texture_firstfree = texnum + 1;
// grow the array and extend the used range when the chosen slot lies beyond them
538 if (dpsoftrast.texture_max <= texnum)
539 DPSOFTRAST_Texture_Grow();
540 if (dpsoftrast.texture_end <= texnum)
541 dpsoftrast.texture_end = texnum + 1;
// start from an all-zero slot, then fill in the description
542 texture = &dpsoftrast.texture[texnum];
543 memset(texture, 0, sizeof(*texture));
544 texture->flags = flags;
545 texture->width = width;
546 texture->height = height;
547 texture->depth = depth;
548 texture->sides = sides;
// mip level table: 4 bytes per texel; [0] is the offset of the level within
// the pixel storage, [1] its size in bytes, [2..4] its dimensions
560 s = w * h * d * sides * 4;
561 texture->mipmap[mipmaps][0] = size;
562 texture->mipmap[mipmaps][1] = s;
563 texture->mipmap[mipmaps][2] = w;
564 texture->mipmap[mipmaps][3] = h;
565 texture->mipmap[mipmaps][4] = d;
// condition on a 1x1x1 level or a texture without the MIPMAP flag
// (presumably ends the level loop - confirm against the full file)
568 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574 texture->mipmaps = mipmaps;
575 texture->size = size;
577 // allocate the pixels now
// NOTE(review): the allocation result is not checked in the visible lines
578 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture `index` and returns its slot to the
// free range. Does nothing when index does not name a live texture.
// NOTE(review): some lines of this function are not visible in this excerpt.
582 void DPSOFTRAST_Texture_Free(int index)
584 DPSOFTRAST_Texture *texture;
585 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
589 MM_FREE(texture->bytes);
// the memset below also clears bytes, which is what marks the slot as unused
590 texture->bytes = NULL;
591 memset(texture, 0, sizeof(*texture));
592 // adjust the free range and used range
593 if (dpsoftrast.texture_firstfree > index)
594 dpsoftrast.texture_firstfree = index;
// trim unused slots off the end of the used range
595 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596 dpsoftrast.texture_end--;
// Rebuilds every mip level i >= 1 of texture `index` by averaging texels of
// level i-1 (4 bytes per texel, each byte channel averaged separately with
// rounding). Does nothing for an invalid index.
// NOTE(review): braces, branch keywords and the assignments of
// layer0/layer1/row0/row1 are not visible in this excerpt.
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
600 int i, x, y, z, w, layer0, layer1, row0, row1;
601 unsigned char *o, *i0, *i1, *i2, *i3;
602 DPSOFTRAST_Texture *texture;
603 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604 if (texture->mipmaps <= 1)
606 for (i = 1;i < texture->mipmaps;i++)
608 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the last layer of the parent level
612 if (layer1 >= texture->mipmap[i-1][4])
613 layer1 = texture->mipmap[i-1][4]-1;
614 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the last row of the parent level
618 if (row1 >= texture->mipmap[i-1][3])
619 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the four parent rows
// (layer0/layer1 x row0/row1) in level i-1
620 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
621 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625 w = texture->mipmap[i][2];
628 if (texture->mipmap[i-1][2] > 1)
630 // average 3D texture
// eight parent texels per output texel: +4 then >>3 rounds to nearest
631 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
633 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
641 // average 3D mipmap with parent width == 1
// four parent texels per output texel; only i0 and i1 are advanced here
642 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
644 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
653 if (texture->mipmap[i-1][2] > 1)
655 // average 2D texture (common case)
// 2x2 parent texels per output texel
656 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
658 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
666 // 2D texture with parent width == 1
// two parent texels (one per row) per output texel
667 o[0] = (i0[0] + i1[0] + 1) >> 1;
668 o[1] = (i0[1] + i1[1] + 1) >> 1;
669 o[2] = (i0[2] + i1[2] + 1) >> 1;
670 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels`
// (tightly packed rows) into texture `index` at (blockx, blocky), then
// regenerates the mipmaps. Does nothing for an invalid index.
// NOTE(review): in the visible lines the destination is always computed from
// mip level 0 and `mip` is not used; some lines (including the loop counter
// update) are not visible in this excerpt.
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
679 DPSOFTRAST_Texture *texture;
681 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
684 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685 while (blockheight > 0)
687 memcpy(dst, pixels, blockwidth * 4);
// source rows are contiguous; destination rows are one texture width apart
688 pixels += blockwidth * 4;
689 dst += texture->mipmap[0][2] * 4;
692 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 of texture `index` with `pixels`
// (mipmap[0][1] bytes are copied) and regenerates the mipmaps.
// Does nothing for an invalid index.
// NOTE(review): some lines of this function are not visible in this excerpt.
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
696 DPSOFTRAST_Texture *texture;
697 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
700 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701 DPSOFTRAST_Texture_CalculateMipmaps(index);
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
705 DPSOFTRAST_Texture *texture;
706 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707 return texture->mipmap[mip][2];
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
711 DPSOFTRAST_Texture *texture;
712 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713 return texture->mipmap[mip][3];
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
717 DPSOFTRAST_Texture *texture;
718 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the pixel
// storage of texture `index`, or a null pointer for an invalid index.
// NOTE(review): mip is not range-checked in the visible lines; some lines of
// this function are not visible in this excerpt.
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
723 DPSOFTRAST_Texture *texture;
724 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of the level within bytes
727 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture `index`. Does nothing for an invalid index.
// Filter modes above DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created
// without DPSOFTRAST_TEXTURE_FLAG_MIPMAP set dpsoftrast.errorstring.
// NOTE(review): some lines of this function are not visible in this excerpt.
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
731 DPSOFTRAST_Texture *texture;
732 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
735 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
740 texture->filter = filter;
// Selects the framebuffer size, the depth buffer and up to four color buffers
// to render into. The stored values are touched only when at least one
// argument differs from the current setting.
// NOTE(review): some lines of this function are not visible in this excerpt.
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
745 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
749 dpsoftrast.fb_width = width;
750 dpsoftrast.fb_height = height;
751 dpsoftrast.fb_depthpixels = depthpixels;
752 dpsoftrast.fb_colorpixels[0] = colorpixels0;
753 dpsoftrast.fb_colorpixels[1] = colorpixels1;
754 dpsoftrast.fb_colorpixels[2] = colorpixels2;
755 dpsoftrast.fb_colorpixels[3] = colorpixels3;
758 static void DPSOFTRAST_Draw_FlushThreads(void);
760 static void DPSOFTRAST_Draw_SyncCommands(void)
762 if(dpsoftrast.usethreads) MEMORY_BARRIER;
763 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Appears to reclaim command pool space until `space` bytes are available,
// waiting on a thread that still has commands to consume - TODO confirm.
// NOTE(review): braces, returns, the declarations of i/commandoffset/waitindex
// and any surrounding loop are not visible in this excerpt, so the exact
// control flow cannot be stated from here.
766 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
768 DPSOFTRAST_State_Thread *thread;
770 int freecommand = dpsoftrast.commandpool.freecommand;
771 int usedcommands = dpsoftrast.commandpool.usedcommands;
772 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
774 DPSOFTRAST_Draw_SyncCommands();
// for each thread, measure how far its read position lies behind the
// allocation position, allowing for wrap-around of the pool
780 for (i = 0; i < dpsoftrast.numthreads; i++)
782 thread = &dpsoftrast.threads[i];
783 commandoffset = freecommand - thread->commandoffset;
784 if (commandoffset < 0)
785 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
786 if (commandoffset > usedcommands)
789 usedcommands = commandoffset;
792 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread unless it has already caught up with drawcommand
794 thread = &dpsoftrast.threads[waitindex];
795 Thread_LockMutex(thread->drawmutex);
796 if (thread->commandoffset != dpsoftrast.drawcommand)
798 thread->waiting = true;
799 if (thread->starving) Thread_CondSignal(thread->drawcond);
800 Thread_CondWait(thread->waitcond, thread->drawmutex);
801 thread->waiting = false;
803 Thread_UnlockMutex(thread->drawmutex);
805 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE. The masking with
// COMMAND_SIZE-1 relies on COMMAND_SIZE being a power of two.
808 #define DPSOFTRAST_ALIGNCOMMAND(size) \
809 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> from the command pool
// with opcode DPSOFTRAST_OPCODE_<name> and the aligned struct size, and yields
// a pointer of the matching struct type.
810 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
811 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool for a command with the given
// opcode and fills in its opcode and commandsize header fields.
// NOTE(review): braces, an else, the updates of freecommand and the return
// statement are not visible in this excerpt.
813 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
815 DPSOFTRAST_Command *command;
816 int freecommand = dpsoftrast.commandpool.freecommand;
817 int usedcommands = dpsoftrast.commandpool.usedcommands;
// room for one extra command header, plus the unusable tail of the pool when
// the command does not fit before the end
818 int extra = sizeof(DPSOFTRAST_Command);
819 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
820 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: make room, then reload the pool counters
821 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
823 if (dpsoftrast.usethreads)
824 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
826 DPSOFTRAST_Draw_FlushThreads();
827 freecommand = dpsoftrast.commandpool.freecommand;
828 usedcommands = dpsoftrast.commandpool.usedcommands;
// the command does not fit before the end of the pool: mark the tail with a
// Reset opcode and count the skipped bytes as used
830 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
832 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833 command->opcode = DPSOFTRAST_OPCODE_Reset;
834 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
837 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838 command->opcode = opcode;
839 command->commandsize = size;
841 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
843 dpsoftrast.commandpool.freecommand = freecommand;
844 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives back `size` bytes of the command pool by adjusting the pool's
// freecommand and usedcommands counters.
// NOTE(review): the lines that move freecommand back and the condition
// guarding the wrap-around adjustment are not visible in this excerpt.
848 static void DPSOFTRAST_UndoCommand(int size)
850 int freecommand = dpsoftrast.commandpool.freecommand;
851 int usedcommands = dpsoftrast.commandpool.usedcommands;
854 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855 usedcommands -= size;
856 dpsoftrast.commandpool.freecommand = freecommand;
857 dpsoftrast.commandpool.usedcommands = usedcommands;
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
863 thread->viewport[0] = command->x;
864 thread->viewport[1] = command->y;
865 thread->viewport[2] = command->width;
866 thread->viewport[3] = command->height;
867 thread->validate |= DPSOFTRAST_VALIDATE_FB;
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
871 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
874 command->width = width;
875 command->height = height;
877 dpsoftrast.viewport[0] = x;
878 dpsoftrast.viewport[1] = y;
879 dpsoftrast.viewport[2] = width;
880 dpsoftrast.viewport[3] = height;
881 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: fill the color buffers with a constant color.
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Applies a queued ClearColor command: fills the scissor rectangle of every
// non-NULL color buffer with the packed BGRA8 color, limited vertically by the
// thread's miny1/maxy1/miny2/maxy2 values.
// NOTE(review): several declarations and statements (including the innermost
// store) are not visible in this excerpt.
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
887 int i, x1, y1, x2, y2, w, h, x, y;
888 int miny1 = thread->miny1;
889 int maxy1 = thread->maxy1;
890 int miny2 = thread->miny2;
891 int maxy2 = thread->maxy2;
// make sure fb_scissor is up to date before reading it
895 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896 x1 = thread->fb_scissor[0];
897 y1 = thread->fb_scissor[1];
898 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
900 if (y1 < miny1) y1 = miny1;
901 if (y2 > maxy2) y2 = maxy2;
906 // FIXME: honor fb_colormask?
907 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
908 for (i = 0;i < 4;i++)
910 if (!dpsoftrast.fb_colorpixels[i])
// two vertical bands: [y1, maxy1) first, then from miny2 up to y2
912 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
915 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916 for (x = x1;x < x2;x++)
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
923 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: fill the depth buffer with a constant depth.
930 DEFCOMMAND(3, ClearDepth, float depth;)
// Applies a queued ClearDepth command: fills the scissor rectangle of the
// depth buffer with the converted depth value, limited vertically by the
// thread's miny1/maxy1/miny2/maxy2 values.
// NOTE(review): several declarations and statements (including the innermost
// store) are not visible in this excerpt.
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
933 int x1, y1, x2, y2, w, h, x, y;
934 int miny1 = thread->miny1;
935 int maxy1 = thread->maxy1;
936 int miny2 = thread->miny2;
937 int maxy2 = thread->maxy2;
// make sure fb_scissor is up to date before reading it
941 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942 x1 = thread->fb_scissor[0];
943 y1 = thread->fb_scissor[1];
944 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
946 if (y1 < miny1) y1 = miny1;
947 if (y2 > maxy2) y2 = maxy2;
952 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two vertical bands: [y1, maxy1) first, then from miny2 up to y2
953 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
956 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957 for (x = x1;x < x2;x++)
961 void DPSOFTRAST_ClearDepth(float d)
963 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
970 thread->colormask[0] = command->r != 0;
971 thread->colormask[1] = command->g != 0;
972 thread->colormask[2] = command->b != 0;
973 thread->colormask[3] = command->a != 0;
974 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
978 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
985 DEFCOMMAND(5, DepthTest, int enable;)
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
988 thread->depthtest = command->enable;
989 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
991 void DPSOFTRAST_DepthTest(int enable)
993 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994 command->enable = enable;
997 DEFCOMMAND(6, ScissorTest, int enable;)
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1000 thread->scissortest = command->enable;
1001 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1003 void DPSOFTRAST_ScissorTest(int enable)
1005 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006 command->enable = enable;
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1012 thread->scissor[0] = command->x;
1013 thread->scissor[1] = command->y;
1014 thread->scissor[2] = command->width;
1015 thread->scissor[3] = command->height;
1016 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1020 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1023 command->width = width;
1024 command->height = height;
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1030 thread->blendfunc[0] = command->sfactor;
1031 thread->blendfunc[1] = command->dfactor;
1032 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1036 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037 command->sfactor = sfactor;
1038 command->dfactor = dfactor;
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1044 thread->blendsubtract = command->enable;
1045 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1047 void DPSOFTRAST_BlendSubtract(int enable)
1049 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050 command->enable = enable;
1053 DEFCOMMAND(10, DepthMask, int enable;)
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1056 thread->depthmask = command->enable;
1058 void DPSOFTRAST_DepthMask(int enable)
1060 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061 command->enable = enable;
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1067 thread->depthfunc = command->func;
1069 void DPSOFTRAST_DepthFunc(int func)
1071 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072 command->func = func;
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1078 thread->depthrange[0] = command->nearval;
1079 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and records both values.
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1083 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084 command->nearval = nearval;
1085 command->farval = farval;
// Declares the PolygonOffset command via DEFCOMMAND; fields are `alongnormal`
// and `intoview`.
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Interpreter side: stores alongnormal in thread->polygonoffset[0] and
// intoview in [1].
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1091 thread->polygonoffset[0] = command->alongnormal;
1092 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both offsets.
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1096 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097 command->alongnormal = alongnormal;
1098 command->intoview = intoview;
// Declares the CullFace command via DEFCOMMAND; its only field is `mode`.
1101 DEFCOMMAND(14, CullFace, int mode;)
// Interpreter side: copies the cull mode into thread->cullface.
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1104 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and records `mode`.
1106 void DPSOFTRAST_CullFace(int mode)
1108 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109 command->mode = mode;
// Declares the AlphaTest command via DEFCOMMAND; its only field is `enable`.
1112 DEFCOMMAND(15, AlphaTest, int enable;)
// Interpreter side: copies the flag into thread->alphatest, which
// DPSOFTRAST_Draw_Span_Finish checks before masking pixels by alpha.
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1115 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command and records `enable`.
1117 void DPSOFTRAST_AlphaTest(int enable)
1119 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120 command->enable = enable;
// Declares the AlphaFunc command via DEFCOMMAND; fields are `func` and `ref`.
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Interpreter side: stores the comparison function in thread->alphafunc and
// the reference value in thread->alphavalue.
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1126 thread->alphafunc = command->func;
1127 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and records the
// comparison function for the interpreter to apply.
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1131 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132 command->func = func;
// Sets the current constant color directly in the global state (no command is
// queued).  DPSOFTRAST_Array_Load passes dpsoftrast.color to DPSOFTRAST_Fill4f
// when it fills the color array.
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1138 dpsoftrast.color[0] = r;
1139 dpsoftrast.color[1] = g;
1140 dpsoftrast.color[2] = b;
1141 dpsoftrast.color[3] = a;
// Reads a rectangle of the color framebuffer into outpixels.
//   blockx, blocky            - origin of the requested rectangle
//   blockwidth, blockheight   - its size in pixels
//   outpixels                 - destination; rows are blockwidth*4 bytes apart
// The rectangle is clamped to the framebuffer before copying.
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// both strides are in bytes (4 bytes per pixel)
1146 int outstride = blockwidth * 4;
1147 int instride = dpsoftrast.fb_width * 4;
1150 int bx2 = blockx + blockwidth;
1151 int by2 = blocky + blockheight;
1155 unsigned char *inpixels;
// clamp the rectangle to the framebuffer bounds
1159 if (bx1 < 0) bx1 = 0;
1160 if (by1 < 0) by1 = 0;
1161 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1162 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1164 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// bigendian hosts walk the row pixel by pixel
1165 if (dpsoftrast.bigendian)
1167 for (y = by1;y < by2;y++)
// source row is fb_height - 1 - y: the framebuffer is read bottom-up
// relative to the requested y
// NOTE(review): the output row is offset by (y - by1) only, with no column
// offset; if bx1/by1 start as blockx/blocky and get clamped up from a
// negative value, the data lands at the top-left of outpixels rather than at
// its position inside the requested block - confirm this is intended
1169 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1170 o = (unsigned char *)outpixels + (y - by1) * outstride;
1171 for (x = bx1;x < bx2;x++)
// second row loop with the same addressing (presumably the non-bigendian path)
1184 for (y = by1;y < by2;y++)
1186 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1187 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the color framebuffer into one mip level of a texture.
//   index          - texture handle (looked up with DPSOFTRAST_Texture_GetByIndex)
//   mip            - destination mip level; must be in [0, texture->mipmaps)
//   tx, ty         - destination origin inside the mip level
//   sx, sy         - source origin inside the framebuffer
//   width, height  - requested size in pixels
// Returns without copying if the texture or mip level is invalid.
1193 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1197 int tx2 = tx + width;
1198 int ty2 = ty + height;
1201 int sx2 = sx + width;
1202 int sy2 = sy + height;
1212 unsigned int *spixels;
1213 unsigned int *tpixels;
1214 DPSOFTRAST_Texture *texture;
1215 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1216 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the color framebuffer, addressed as 32-bit pixels
1218 spixels = dpsoftrast.fb_colorpixels[0];
1219 swidth = dpsoftrast.fb_width;
1220 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as the byte offset of this level inside
// texture->bytes, [2] and [3] as its width and height
1221 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1222 twidth = texture->mipmap[mip][2];
1223 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the mip level...
1224 if (tx1 < 0) tx1 = 0;
1225 if (ty1 < 0) ty1 = 0;
1226 if (tx2 > twidth) tx2 = twidth;
1227 if (ty2 > theight) ty2 = theight;
// ...and the source rectangle to the framebuffer
1228 if (sx1 < 0) sx1 = 0;
1229 if (sy1 < 0) sy1 = 0;
1230 if (sx2 > swidth) sx2 = swidth;
1231 if (sy2 > sheight) sy2 = sheight;
// copy the smaller of the two sizes; an empty result is rejected
1236 if (tw > sw) tw = sw;
1237 if (th > sh) th = sh;
1238 if (tw < 1 || th < 1)
// one memcpy per row, tw pixels of 4 bytes each
1240 for (y = 0;y < th;y++)
1241 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// regenerate the mip chain when the texture has more than one level
1242 if (texture->mipmaps > 1)
1243 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Declares the SetTexture command via DEFCOMMAND; fields are the texture unit
// number and the texture pointer to bind.
1246 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Interpreter side: if this thread already has a texture bound on the unit,
// atomically decrements that texture's bind count, then records the new
// binding for the unit.
1247 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1249 if (thread->texbound[command->unitnum])
1250 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1251 thread->texbound[command->unitnum] = command->texture;
1253 void DPSOFTRAST_SetTexture(int unitnum, int index)
1255 DPSOFTRAST_Command_SetTexture *command;
1256 DPSOFTRAST_Texture *texture;
1257 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1259 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1262 texture = DPSOFTRAST_Texture_GetByIndex(index);
1263 if (index && !texture)
1265 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1269 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1270 command->unitnum = unitnum;
1271 command->texture = texture;
1273 dpsoftrast.texbound[unitnum] = texture;
1274 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its stride in the global state.
// DPSOFTRAST_Array_Load multiplies the stride by firstvertex as a byte offset.
1277 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1279 dpsoftrast.pointer_vertex3f = vertex3f;
1280 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array: stores the pointer and stride and clears
// the unsigned-byte color pointer so only one color source is active.
1282 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1284 dpsoftrast.pointer_color4f = color4f;
1285 dpsoftrast.pointer_color4ub = NULL;
1286 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array: stores the pointer and stride and
// clears the float color pointer so only one color source is active.
1288 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1290 dpsoftrast.pointer_color4f = NULL;
1291 dpsoftrast.pointer_color4ub = color4ub;
1292 dpsoftrast.stride_color = stride;
// Records the texcoord array for one unit: pointer, component count and
// stride.  DPSOFTRAST_Array_Load switches on the component count to pick a
// loader.
// NOTE(review): unitnum is used as an array index without a range check here
// - confirm callers guarantee it is valid.
1294 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1296 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1297 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1298 dpsoftrast.stride_texcoord[unitnum] = stride;
// Declares the SetShader command via DEFCOMMAND; fields are `mode` and
// `permutation`.
1301 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Interpreter side: copies the shader mode and permutation into the thread state.
1302 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1304 thread->shader_mode = command->mode;
1305 thread->shader_permutation = command->permutation;
// Public entry point: queues a SetShader command and also mirrors the
// selection into the global dpsoftrast state.
1307 void DPSOFTRAST_SetShader(int mode, int permutation)
1309 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1310 command->mode = mode;
1311 command->permutation = permutation;
1313 dpsoftrast.shader_mode = mode;
1314 dpsoftrast.shader_permutation = permutation;
// Declares the Uniform4f command via DEFCOMMAND; fields are the uniform index
// and four float values.
1317 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Interpreter side: copies the four floats into the thread's uniform4f array
// at slot index*4 (each uniform occupies four consecutive floats).
1318 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1320 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: queues a Uniform4f command carrying (v0, v1, v2, v3) and
// mirrors the same values into dpsoftrast.uniform4f at slot index*4.
1322 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1324 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1325 command->index = index;
1326 command->val[0] = v0;
1327 command->val[1] = v1;
1328 command->val[2] = v2;
1329 command->val[3] = v3;
// keep the global copy in sync with what was queued
1331 dpsoftrast.uniform4f[index*4+0] = v0;
1332 dpsoftrast.uniform4f[index*4+1] = v1;
1333 dpsoftrast.uniform4f[index*4+2] = v2;
1334 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: reads four floats from v, queues them in
// a Uniform4f command and mirrors them into dpsoftrast.uniform4f at index*4.
1336 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1338 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1339 command->index = index;
1340 memcpy(command->val, v, sizeof(command->val));
1342 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Declares the UniformMatrix4f command via DEFCOMMAND; fields are the uniform
// index and 16 floats declared through ALIGN().
1345 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Interpreter side: copies all 16 floats into the thread's uniform4f array
// starting at slot index*4, i.e. the matrix spans four consecutive vec4 slots.
1346 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1348 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform `uniform`.
//   uniform    - first uniform slot; each matrix advances the slot by 4
//   arraysize  - number of matrices read from v (16 floats each)
//   transpose  - presumably selects the transpose step below - TODO confirm
//   v          - source floats; may be unaligned
// Each matrix is queued in its own UniformMatrix4f command and mirrored into
// dpsoftrast.uniform4f.
1350 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1354 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1356 __m128 m0, m1, m2, m3;
1357 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1358 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the address of v
1359 if (((size_t)v)&(ALIGN_SIZE-1))
1361 m0 = _mm_loadu_ps(v);
1362 m1 = _mm_loadu_ps(v+4);
1363 m2 = _mm_loadu_ps(v+8);
1364 m3 = _mm_loadu_ps(v+12);
1368 m0 = _mm_load_ps(v);
1369 m1 = _mm_load_ps(v+4);
1370 m2 = _mm_load_ps(v+8);
1371 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the low and high halves
1375 __m128 t0, t1, t2, t3;
1376 t0 = _mm_unpacklo_ps(m0, m1);
1377 t1 = _mm_unpacklo_ps(m2, m3);
1378 t2 = _mm_unpackhi_ps(m0, m1);
1379 t3 = _mm_unpackhi_ps(m2, m3);
1380 m0 = _mm_movelh_ps(t0, t1);
1381 m1 = _mm_movehl_ps(t1, t0);
1382 m2 = _mm_movelh_ps(t2, t3);
1383 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the queued command...
1385 _mm_store_ps(command->val, m0);
1386 _mm_store_ps(command->val+4, m1);
1387 _mm_store_ps(command->val+8, m2);
1388 _mm_store_ps(command->val+12, m3);
// ...and into the global mirror (aligned stores, so uniform4f must be
// 16-byte aligned)
1389 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1390 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1391 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1392 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Declares the Uniform1i command via DEFCOMMAND; fields are the uniform index
// and one int value.
1397 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Interpreter side: stores the value in the thread's uniform1i array.
1398 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1400 thread->uniform1i[command->index] = command->val;
// Public entry point: queues a Uniform1i command for `index` and mirrors i0
// into the global dpsoftrast.uniform1i array.
1402 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1404 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1405 command->index = index;
1408 dpsoftrast.uniform1i[command->index] = i0;
// Copies 4-float elements from a strided byte source into dst.
//   dst    - destination, written with aligned stores (must be 16-byte aligned)
//   src    - source bytes, read as 4 floats per element
//   size   - element count (end is dst + size*4 floats)
//   stride - source element stride in bytes
1412 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1414 float *end = dst + size*4;
// aligned loads are only valid if both the start address and the stride keep
// every element on an ALIGN_SIZE boundary
1415 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1419 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1428 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float elements from a strided byte source into 4-float elements
// in dst, with the fourth component set to 1.0f.
//   dst    - destination, written with aligned stores (must be 16-byte aligned)
//   src    - source bytes
//   size   - element count (end is dst + size*4 floats)
//   stride - source element stride in bytes
// NOTE(review): the one-element path loads 16 bytes per element although only
// 12 are used; with tightly packed data the last element's load reads 4 bytes
// past the array - confirm callers tolerate or pad for this.
1435 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1437 float *end = dst + size*4;
// tightly packed input: handle elements in groups of four (12 floats in,
// 16 floats out); end4 marks the end of the whole groups
1438 if (stride == sizeof(float[3]))
1440 float *end4 = dst + (size&~3)*4;
1441 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3.  Each output is built
// as (?, x, y, z), lane 0 is overwritten with 1.0f by _mm_move_ss, and the
// final shuffle rotates that into (x, y, z, 1).
1445 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1446 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1447 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1448 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1449 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1450 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1451 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1452 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1453 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1454 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1455 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1456 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1457 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459 src += 4*sizeof(float[3]);
// same four-element conversion using aligned loads
1466 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1467 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1468 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1469 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1470 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1471 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1472 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1474 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1475 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1476 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1477 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 src += 4*sizeof(float[3]);
// one element at a time: load (x, y, z, next), rotate the unwanted fourth
// float into lane 0, replace it with 1.0f, and rotate back to (x, y, z, 1)
1484 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1488 __m128 v = _mm_loadu_ps((const float *)src);
1489 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1490 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1491 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1492 _mm_store_ps(dst, v);
1501 __m128 v = _mm_load_ps((const float *)src);
1502 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1503 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1504 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1505 _mm_store_ps(dst, v);
// Expands 2-float elements from a strided byte source into 4-float elements
// in dst as (x, y, 0, 1).
//   dst    - destination, written with aligned stores (must be 16-byte aligned)
//   src    - source bytes
//   size   - element count (end is dst + size*4 floats)
//   stride - source element stride in bytes
1512 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1514 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output element
1515 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// tightly packed input: handle elements in pairs; end2 marks the end of the
// whole pairs
1516 if (stride == sizeof(float[2]))
1518 float *end2 = dst + (size&~1)*4;
1519 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = x0 y0 x1 y1: the shuffle keeps (x0, y0) and appends (0, 1);
// _mm_movehl_ps moves (x1, y1) into the low half in front of (0, 1)
1523 __m128 v = _mm_loadu_ps((const float *)src);
1524 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1525 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1527 src += 2*sizeof(float[2]);
1534 __m128 v = _mm_load_ps((const float *)src);
1535 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1536 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1538 src += 2*sizeof(float[2]);
// one element at a time: load only the two source floats into the low half of v2
1544 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte elements from a strided byte source into 4-float elements
// in dst, scaling each byte by 1/255.
//   dst    - destination, written with aligned stores (must be 16-byte aligned)
//   src    - source bytes, 4 unsigned bytes per element
//   size   - element count (end is dst + size*4 floats)
//   stride - source element stride in bytes
1550 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1552 float *end = dst + size*4;
1553 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// tightly packed input: handle elements in groups of four (16 bytes in,
// 16 floats out); end4 marks the end of the whole groups
1554 if (stride == sizeof(unsigned char[4]))
1556 float *end4 = dst + (size&~3)*4;
1557 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes to 16 bits then to 32 bits by interleaving with zero, convert
// to float and scale
1561 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1562 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1563 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1564 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1565 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1567 src += 4*sizeof(unsigned char[4]);
// same four-element conversion using an aligned load
1574 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1575 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1576 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1577 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1578 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1580 src += 4*sizeof(unsigned char[4]);
// one element at a time: read the four bytes as a single 32-bit value
1586 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1587 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of one 4-float value.
//   dst  - destination, written with aligned stores (must be 16-byte aligned)
//   src  - the 4 floats to replicate; loaded once, may be unaligned
//   size - element count (end is dst + 4*size floats)
1593 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1595 float *end = dst + 4*size;
1596 __m128 v = _mm_loadu_ps(src);
1599 _mm_store_ps(dst, v);
// Transforms 4-float vectors by a 4x4 matrix.
//   out4f       - destination (may be the same pointer as in4f)
//   in4f        - source vectors; aligned or unaligned loads are chosen at run time
//   numitems    - number of vectors
//   inmatrix16f - 16 floats, read as four groups m0..m3 of 4 floats each
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
1605 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1608 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1609 __m128 m0, m1, m2, m3;
// the identity test is a byte-wise comparison of all 16 floats
1611 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1613 // fast case for identity matrix
1614 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1617 end = out4f + numitems*4;
// the matrix itself is always read with unaligned loads
1618 m0 = _mm_loadu_ps(inmatrix16f);
1619 m1 = _mm_loadu_ps(inmatrix16f + 4);
1620 m2 = _mm_loadu_ps(inmatrix16f + 8);
1621 m3 = _mm_loadu_ps(inmatrix16f + 12);
1622 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// broadcast each component of v and accumulate it against its matrix group
1626 __m128 v = _mm_loadu_ps(in4f);
1628 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1629 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1630 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1631 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// same computation using an aligned load of the input vector
1640 __m128 v = _mm_load_ps(in4f);
1642 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1643 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1644 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1645 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f with memcpy; the two
// ranges must not overlap.
1653 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1655 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides and viewport-maps one vertex.  With in = (x, y, z, w),
// the vector is rearranged to (1, x, y, z), multiplied by viewportscale,
// divided by w, offset by viewportcenter and rotated back, so that
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN/sN are lanes of viewportcenter/viewportscale.
1659 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1661 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1662 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1663 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1664 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same sequence of operations as the visible body of
// DPSOFTRAST_PROJECTVERTEX: rearrange to (1, x, y, z), scale, divide by w,
// add the viewport center and rotate back.
1667 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1669 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1670 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1671 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1672 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one vertex by a matrix given as four vectors m0..m3:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// Each component of p is broadcast across all lanes before the multiply.
1675 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1678 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1679 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1680 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1681 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Derives screen-space row bounds for a bounding box.
//   starty, endy                  - outputs: row range covering the projected box
//   minpos, maxpos                - box corners in the space the matrix expects
//   viewportcenter, viewportscale - viewport mapping (same layout as used by
//                                   DPSOFTRAST_PROJECTVERTEX)
//   m0..m3                        - transform applied to the box corners
// The eight corners are transformed and tested against z + w >= 0.  Corners
// that pass contribute y/w directly; corners that fail contribute the points
// where their box edges cross that plane.  The min/max of those values are
// clamped to [-2, 2] and mapped through the viewport.
// Returns an int; clipmask holds one bit per corner that failed the test
// (presumably this is the return value - TODO confirm).
1684 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
// start with every corner marked clipped and an empty (inverted) range
1686 int clipmask = 0xFF;
1687 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of each matrix vector so the transformed Y lands in
// lane 0, where the scalar (_ss) operations below work
1688 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1689 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1690 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1691 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if it is
// not negative clear the corner's clip bit and fold y/w into minproj/maxproj
1692 #define BBFRONT(k, pos) \
1694 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1695 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1696 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1699 clipmask &= ~(1<<k); \
1700 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1701 minproj = _mm_min_ss(minproj, proj); \
1702 maxproj = _mm_max_ss(maxproj, proj); \
// corners mix min/max per axis: bit 0 of k selects x, bit 1 selects y, bit 2
// selects z from maxpos
1706 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1707 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1708 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1709 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1710 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1711 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// body of BBCLIP(k): for a clipped corner k, visit its three edge neighbours
// (k^1, k^2, k^4); for each neighbour that is not clipped, interpolate along
// the edge to where clipdist reaches zero, project, and fold into the range
1715 if (clipmask&(1<<k)) \
1717 if (!(clipmask&(1<<(k^1)))) \
1719 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1720 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1721 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1722 minproj = _mm_min_ss(minproj, proj); \
1723 maxproj = _mm_max_ss(maxproj, proj); \
1725 if (!(clipmask&(1<<(k^2)))) \
1727 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1728 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1729 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1730 minproj = _mm_min_ss(minproj, proj); \
1731 maxproj = _mm_max_ss(maxproj, proj); \
1733 if (!(clipmask&(1<<(k^4)))) \
1735 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1736 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1737 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1738 minproj = _mm_min_ss(minproj, proj); \
1739 maxproj = _mm_max_ss(maxproj, proj); \
1743 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors into lane 0 for the scalar math below
// (presumably the Y center and scale - TODO confirm against the viewport setup)
1744 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1745 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it through the viewport
1746 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1747 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1748 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1749 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// the start row comes from maxproj and the end row from minproj (truncated,
// end is +1), which implies the Y mapping is inverted
1750 *starty = _mm_cvttss_si32(maxproj);
1751 *endy = _mm_cvttss_si32(minproj)+1;
// Projects vertices that need no matrix transform.
//   out4f        - receives a copy of each input vertex
//   screen4f     - receives each vertex after DPSOFTRAST_PROJECTVERTEX
//   starty, endy - outputs forwarded to DPSOFTRAST_Vertex_BoundY
//   in4f         - source vertices; aligned or unaligned loads chosen at run time
//   numitems     - vertex count (end is out4f + numitems*4 floats)
// While iterating, the component-wise min and max of the inputs are tracked
// and then passed to DPSOFTRAST_Vertex_BoundY with an identity matrix.
// Returns the value returned by DPSOFTRAST_Vertex_BoundY.
1755 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1757 float *end = out4f + numitems*4;
1758 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1759 __m128 minpos, maxpos;
1760 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1762 minpos = maxpos = _mm_loadu_ps(in4f);
1765 __m128 v = _mm_loadu_ps(in4f);
1766 minpos = _mm_min_ps(minpos, v);
1767 maxpos = _mm_max_ps(maxpos, v);
1768 _mm_store_ps(out4f, v);
1769 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1770 _mm_store_ps(screen4f, v);
// same loop body using aligned loads
1778 minpos = maxpos = _mm_load_ps(in4f);
1781 __m128 v = _mm_load_ps(in4f);
1782 minpos = _mm_min_ps(minpos, v);
1783 maxpos = _mm_max_ps(maxpos, v);
1784 _mm_store_ps(out4f, v);
1785 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1786 _mm_store_ps(screen4f, v);
// the bounding box is already in clip space, so bound it with an identity matrix
1793 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1794 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1795 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1796 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1797 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms vertices by a 4x4 matrix and projects them to screen space.
//   out4f        - receives each transformed vertex
//   screen4f     - receives each transformed vertex after DPSOFTRAST_PROJECTVERTEX
//   starty, endy - outputs forwarded to DPSOFTRAST_Vertex_BoundY
//   in4f         - source vertices; aligned or unaligned loads chosen at run time
//   numitems     - vertex count (end is out4f + numitems*4 floats)
//   inmatrix16f  - 16 floats, read as four groups m0..m3 of 4 floats each
// The min/max of the untransformed inputs are tracked and handed to
// DPSOFTRAST_Vertex_BoundY together with the matrix.
// Returns the value returned by DPSOFTRAST_Vertex_BoundY (or by
// DPSOFTRAST_Vertex_Project on the identity fast path).
1801 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1803 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1804 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// a byte-identical identity matrix takes the projection-only path
1806 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1807 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1808 end = out4f + numitems*4;
1809 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1810 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1811 m0 = _mm_loadu_ps(inmatrix16f);
1812 m1 = _mm_loadu_ps(inmatrix16f + 4);
1813 m2 = _mm_loadu_ps(inmatrix16f + 8);
1814 m3 = _mm_loadu_ps(inmatrix16f + 12);
1815 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1817 minpos = maxpos = _mm_loadu_ps(in4f);
// bounds are taken before the transform; the transformed vertex goes to
// out4f and its projection to screen4f
1820 __m128 v = _mm_loadu_ps(in4f);
1821 minpos = _mm_min_ps(minpos, v);
1822 maxpos = _mm_max_ps(maxpos, v);
1823 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1824 _mm_store_ps(out4f, v);
1825 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1826 _mm_store_ps(screen4f, v);
// same loop body using aligned loads
1834 minpos = maxpos = _mm_load_ps(in4f);
1837 __m128 v = _mm_load_ps(in4f);
1838 minpos = _mm_min_ps(minpos, v);
1839 maxpos = _mm_max_ps(maxpos, v);
1840 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1841 _mm_store_ps(out4f, v);
1842 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843 _mm_store_ps(screen4f, v);
1850 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Loads one client vertex array into the 4-float working array
// dpsoftrast.post_array4f[outarray], for the vertex range
// [firstvertex, firstvertex + numvertices).
//   outarray - index of the destination working array
//   inarray  - which client array to read (DPSOFTRAST_ARRAY_*)
// Returns a float pointer (presumably the destination array - TODO confirm).
1855 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1858 float *outf = dpsoftrast.post_array4f[outarray];
1859 const unsigned char *inb;
1860 int firstvertex = dpsoftrast.firstvertex;
1861 int numvertices = dpsoftrast.numvertices;
// positions: 3 floats per vertex, expanded to 4
1865 case DPSOFTRAST_ARRAY_POSITION:
1866 stride = dpsoftrast.stride_vertex;
1867 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1868 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array if set, else byte array if set; the constant color is
// used to fill the array (presumably when neither pointer is set)
1870 case DPSOFTRAST_ARRAY_COLOR:
1871 stride = dpsoftrast.stride_color;
1872 if (dpsoftrast.pointer_color4f)
1874 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1875 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1877 else if (dpsoftrast.pointer_color4ub)
1879 stride = dpsoftrast.stride_color;
1880 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1881 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1885 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0, and the loader
// is picked by the unit's component count
1889 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1890 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1892 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1893 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1896 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1899 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1902 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f.
//   outarray    - index of the working array to transform
//   inarray     - client array to load first; if negative, the existing
//                 contents of post_array4f[outarray] are used as-is
//   inmatrix16f - 4x4 matrix passed to DPSOFTRAST_Vertex_Transform
1914 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1916 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1917 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without a matrix transform.
//   outarray - index of the working array
//   inarray  - client array to load first; if negative, the existing contents
//              of post_array4f[outarray] are used as-is
// Screen coordinates go to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty/drawendy, and the result of
// DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
1922 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1925 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1926 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to
// screen space.
//   outarray    - index of the working array
//   inarray     - client array to load first; if negative, the existing
//                 contents of post_array4f[outarray] are used as-is
//   inmatrix16f - 4x4 matrix passed to DPSOFTRAST_Vertex_TransformProject
// Screen coordinates go to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty/drawendy, and the result of
// DPSOFTRAST_Vertex_TransformProject to dpsoftrast.drawclipped.
1934 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1937 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1938 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel 1/w values for a span.
//   thread   - rendering thread state
//   triangle - supplies the w plane: w[0] is the slope along x, w[1] the
//              slope along y, w[2] the constant term
//   span     - supplies the span origin (x, y) and the pixel range
//              [startx, endx) relative to it
//   zf       - presumably receives one value per pixel - TODO confirm
// The exact reciprocal is evaluated only at sub-span boundaries, every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, and stepped linearly by dz in between.
1945 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1948 int startx = span->startx;
1949 int endx = span->endx;
1950 float wslope = triangle->w[0];
// w at the span origin
1951 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact 1/w at the first pixel
1952 float endz = 1.0f / (w + wslope * startx);
1953 for (x = startx;x < endx;)
1955 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened to end on the final pixel
1957 if (nextsub >= endx) nextsub = endsub = endx-1;
1958 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step across this sub-span; zero when it is a single pixel
1959 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1960 for (; x <= endsub; x++, z += dz)
1965 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1968 int startx = span->startx;
1969 int endx = span->endx;
1972 unsigned char * RESTRICT pixelmask = span->pixelmask;
1973 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1976 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1977 // handle alphatest now (this affects depth writes too)
1978 if (thread->alphatest)
1979 for (x = startx;x < endx;x++)
1980 if (in4f[x*4+3] < 0.5f)
1981 pixelmask[x] = false;
1982 // FIXME: this does not handle bigendian
1983 switch(thread->fb_blendmode)
1985 case DPSOFTRAST_BLENDMODE_OPAQUE:
1986 for (x = startx;x < endx;x++)
1990 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1991 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1992 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1993 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1994 pixel[x*4+0] = d[0];
1995 pixel[x*4+1] = d[1];
1996 pixel[x*4+2] = d[2];
1997 pixel[x*4+3] = d[3];
2000 case DPSOFTRAST_BLENDMODE_ALPHA:
2001 for (x = startx;x < endx;x++)
2005 a = in4f[x*4+3] * 255.0f;
2006 b = 1.0f - in4f[x*4+3];
2007 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2008 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2009 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2010 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2011 pixel[x*4+0] = d[0];
2012 pixel[x*4+1] = d[1];
2013 pixel[x*4+2] = d[2];
2014 pixel[x*4+3] = d[3];
2017 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2018 for (x = startx;x < endx;x++)
2022 a = in4f[x*4+3] * 255.0f;
2023 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2024 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2025 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2026 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2027 pixel[x*4+0] = d[0];
2028 pixel[x*4+1] = d[1];
2029 pixel[x*4+2] = d[2];
2030 pixel[x*4+3] = d[3];
2033 case DPSOFTRAST_BLENDMODE_ADD:
2034 for (x = startx;x < endx;x++)
2038 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2039 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2040 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2041 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2042 pixel[x*4+0] = d[0];
2043 pixel[x*4+1] = d[1];
2044 pixel[x*4+2] = d[2];
2045 pixel[x*4+3] = d[3];
2048 case DPSOFTRAST_BLENDMODE_INVMOD:
2049 for (x = startx;x < endx;x++)
2053 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2054 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2055 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2056 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2057 pixel[x*4+0] = d[0];
2058 pixel[x*4+1] = d[1];
2059 pixel[x*4+2] = d[2];
2060 pixel[x*4+3] = d[3];
2063 case DPSOFTRAST_BLENDMODE_MUL:
2064 for (x = startx;x < endx;x++)
2068 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2069 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2070 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2071 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2072 pixel[x*4+0] = d[0];
2073 pixel[x*4+1] = d[1];
2074 pixel[x*4+2] = d[2];
2075 pixel[x*4+3] = d[3];
2078 case DPSOFTRAST_BLENDMODE_MUL2:
2079 for (x = startx;x < endx;x++)
2083 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2084 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2085 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2086 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2087 pixel[x*4+0] = d[0];
2088 pixel[x*4+1] = d[1];
2089 pixel[x*4+2] = d[2];
2090 pixel[x*4+3] = d[3];
2093 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2094 for (x = startx;x < endx;x++)
2098 a = in4f[x*4+3] * -255.0f;
2099 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2100 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2101 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2102 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2103 pixel[x*4+0] = d[0];
2104 pixel[x*4+1] = d[1];
2105 pixel[x*4+2] = d[2];
2106 pixel[x*4+3] = d[3];
2109 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2110 for (x = startx;x < endx;x++)
2115 b = 1.0f - in4f[x*4+3];
2116 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2117 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2118 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2119 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2120 pixel[x*4+0] = d[0];
2121 pixel[x*4+1] = d[1];
2122 pixel[x*4+2] = d[2];
2123 pixel[x*4+3] = d[3];
2126 case DPSOFTRAST_BLENDMODE_INVADD:
2127 for (x = startx;x < endx;x++)
2131 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2132 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2133 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2134 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2135 pixel[x*4+0] = d[0];
2136 pixel[x*4+1] = d[1];
2137 pixel[x*4+2] = d[2];
2138 pixel[x*4+3] = d[3];
// Writes a span of shaded BGRA8 pixels (in4ub, 4 bytes per pixel, indexed by x)
// into the first color buffer, dpsoftrast.fb_colorpixels[0], at row span->y
// starting at column span->x, combining them with the pixels already there
// according to thread->fb_blendmode.
// span->pixelmask holds one byte per pixel; the SSE paths below test it against
// 0x01 bytes to decide which pixels are written.
// All blended results are stored through _mm_packus_epi16, so they saturate to 0..255.
2144 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2148 int startx = span->startx;
2149 int endx = span->endx;
2150 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2151 unsigned char * RESTRICT pixelmask = span->pixelmask;
2152 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2153 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both framebuffer views (byte-wise and 32-bit) to the first pixel of this span's row
2156 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2157 pixeli += span->y * dpsoftrast.fb_width + span->x;
2158 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub holds unsigned bytes, so "< 0.5f" is only true for alpha == 0;
// a half-alpha cutoff on 0..255 data would be "< 128" - confirm the intended threshold.
2159 if (thread->alphatest)
2160 for (x = startx;x < endx;x++)
2161 if (in4ub[x*4+3] < 0.5f)
2162 pixelmask[x] = false;
2163 // FIXME: this does not handle bigendian
2164 switch(thread->fb_blendmode)
// opaque: when four consecutive mask bytes are all 1, copy four pixels with one unaligned 128-bit store
2166 case DPSOFTRAST_BLENDMODE_OPAQUE:
2167 for (x = startx;x + 4 <= endx;)
2169 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2171 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// alpha blend: dst += (src - dst) * srcalpha / 256 (mulhi of both operands shifted left by 4).
// FINISHBLEND walks the span two pixels at a time, switching on a 16-bit read of two mask
// bytes, then finishes any remaining pixel singly; src/dst are widened to 16 bits per channel.
// presumably blend2 is the code applied on the two-pixel path and blend1 on the
// single-pixel paths - TODO confirm against the full macro body.
2185 case DPSOFTRAST_BLENDMODE_ALPHA:
2186 #define FINISHBLEND(blend2, blend1) \
2187 for (x = startx;x + 1 < endx;x += 2) \
2190 switch (*(const unsigned short*)&pixelmask[x]) \
2193 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2194 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2196 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2199 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2200 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2202 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2205 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2206 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2208 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2213 for(;x < endx; x++) \
2216 if (!pixelmask[x]) \
2218 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2221 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2225 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2228 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// additive with alpha: dst += (src * srcalpha) >> 8
2232 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2234 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2237 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// additive: dst = src + dst
2241 case DPSOFTRAST_BLENDMODE_ADD:
2242 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// inverse modulate: dst -= (dst * src) >> 8
2244 case DPSOFTRAST_BLENDMODE_INVMOD:
2246 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// modulate: dst = (src * dst) >> 8
2251 case DPSOFTRAST_BLENDMODE_MUL:
2252 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// modulate x2: dst = (src * dst) >> 7
2254 case DPSOFTRAST_BLENDMODE_MUL2:
2255 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subtractive with alpha: dst -= (src * srcalpha) >> 8
2257 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2259 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2262 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// pseudo alpha: dst = src + dst - ((dst * srcalpha) >> 8), i.e. src is added unscaled
2266 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2268 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2271 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// inverse add: dst += (255 - dst) * src / 256
2275 case DPSOFTRAST_BLENDMODE_INVADD:
2277 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texunitindex for every pixel of the span and
// writes the result as four floats per pixel to out4f (texture bytes are read as
// B,G,R,A and stored as R,G,B,A scaled by 1/255).
// Texture coordinates come from vertex attribute arrayindex: data + slope*x,
// multiplied by zf[x] and by the mip level's width/height.
// The exact coordinates are evaluated only at subspan boundaries (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels) and stepped linearly in 16.16 fixed point between them.
2286 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2289 int startx = span->startx;
2290 int endx = span->endx;
2295 float tc[2], endtc[2];
2297 unsigned int tci[2];
2298 unsigned int tci1[2];
2299 unsigned int tcimin[2];
2300 unsigned int tcimax[2];
2305 const unsigned char * RESTRICT pixelbase;
2306 const unsigned char * RESTRICT pixel[4];
2307 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2308 // if no texture is bound, just fill it with white
2311 for (x = startx;x < endx;x++)
2313 out4f[x*4+0] = 1.0f;
2314 out4f[x*4+1] = 1.0f;
2315 out4f[x*4+2] = 1.0f;
2316 out4f[x*4+3] = 1.0f;
2320 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2321 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2322 // if this mipmap of the texture is 1 pixel, just fill it with that color
// NOTE(review): the color is read from texture->bytes[0..3], not from pixelbase, so it is
// the first texel of the whole texture data rather than of this mip level - confirm
// (the BGRA8 variant of this function reads through pixelbase).
2323 if (texture->mipmap[mip][1] == 4)
2325 c[0] = texture->bytes[2] * (1.0f/255.0f);
2326 c[1] = texture->bytes[1] * (1.0f/255.0f);
2327 c[2] = texture->bytes[0] * (1.0f/255.0f);
2328 c[3] = texture->bytes[3] * (1.0f/255.0f);
2329 for (x = startx;x < endx;x++)
2331 out4f[x*4+0] = c[0];
2332 out4f[x*4+1] = c[1];
2333 out4f[x*4+2] = c[2];
2334 out4f[x*4+3] = c[3];
2338 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2339 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2340 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the width and height of this mip level
2341 tcscale[0] = texture->mipmap[mip][2];
2342 tcscale[1] = texture->mipmap[mip][3];
2343 tciwidth = texture->mipmap[mip][2];
2346 tcimax[0] = texture->mipmap[mip][2]-1;
2347 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 used as an AND mask for repeat addressing; this only wraps correctly for
// power-of-two sizes - presumably guaranteed by the texture loader, TODO confirm
2348 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2349 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel; the -0.5 shifts to texel centers
2350 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2351 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2352 for (x = startx;x < endx;)
2354 unsigned int subtc[2];
2355 unsigned int substep[2];
2356 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2357 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last (possibly shorter) subspan: end exactly on the final pixel and rescale the step
2358 if (nextsub >= endx)
2360 nextsub = endsub = endx-1;
2361 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2365 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2366 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// per-pixel step and start coordinate in 16.16 fixed point
2367 substep[0] = (endtc[0] - tc[0]) * subscale;
2368 substep[1] = (endtc[1] - tc[1]) * subscale;
2369 subtc[0] = tc[0] * (1<<16);
2370 subtc[1] = tc[1] * (1<<16);
// bilinear filtering, clamp-to-edge addressing
2373 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2375 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// NOTE(review): the texel index below is subtc>>16, so the fraction occupies bits 0..15,
// yet frac masks only the low 12 bits (&0xFFF) instead of taking the top 12 fraction
// bits ((subtc>>4)&0xFFF) - confirm whether this is intended.
2377 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2378 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four weights always sum to 0x1000*0x1000 = 0x1000000
2379 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2380 tci[0] = subtc[0]>>16;
2381 tci[1] = subtc[1]>>16;
2382 tci1[0] = tci[0] + 1;
2383 tci1[1] = tci[1] + 1;
2384 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2385 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2386 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2387 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// the 2x2 texel neighbourhood: (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2388 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2389 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2390 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2391 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * 0x1000000 normalizes byte value times total weight to 0..1
2392 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2393 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2394 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2395 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2396 out4f[x*4+0] = c[0];
2397 out4f[x*4+1] = c[1];
2398 out4f[x*4+2] = c[2];
2399 out4f[x*4+3] = c[3];
// bilinear filtering, repeat addressing (indices wrapped with the size-1 mask)
2404 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2406 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2407 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2408 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2409 tci[0] = subtc[0]>>16;
2410 tci[1] = subtc[1]>>16;
2411 tci1[0] = tci[0] + 1;
2412 tci1[1] = tci[1] + 1;
2413 tci[0] &= tciwrapmask[0];
2414 tci[1] &= tciwrapmask[1];
2415 tci1[0] &= tciwrapmask[0];
2416 tci1[1] &= tciwrapmask[1];
2417 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2418 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2419 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2420 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2421 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2422 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2423 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2424 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2425 out4f[x*4+0] = c[0];
2426 out4f[x*4+1] = c[1];
2427 out4f[x*4+2] = c[2];
2428 out4f[x*4+3] = c[3];
// single-texel (unfiltered) lookup, clamp-to-edge addressing
2432 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2434 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2436 tci[0] = subtc[0]>>16;
2437 tci[1] = subtc[1]>>16;
2438 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2439 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2440 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2441 c[0] = pixel[0][2] * (1.0f / 255.0f);
2442 c[1] = pixel[0][1] * (1.0f / 255.0f);
2443 c[2] = pixel[0][0] * (1.0f / 255.0f);
2444 c[3] = pixel[0][3] * (1.0f / 255.0f);
2445 out4f[x*4+0] = c[0];
2446 out4f[x*4+1] = c[1];
2447 out4f[x*4+2] = c[2];
2448 out4f[x*4+3] = c[3];
// single-texel (unfiltered) lookup, repeat addressing
2453 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2455 tci[0] = subtc[0]>>16;
2456 tci[1] = subtc[1]>>16;
2457 tci[0] &= tciwrapmask[0];
2458 tci[1] &= tciwrapmask[1];
2459 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460 c[0] = pixel[0][2] * (1.0f / 255.0f);
2461 c[1] = pixel[0][1] * (1.0f / 255.0f);
2462 c[2] = pixel[0][0] * (1.0f / 255.0f);
2463 c[3] = pixel[0][3] * (1.0f / 255.0f);
2464 out4f[x*4+0] = c[0];
2465 out4f[x*4+1] = c[1];
2466 out4f[x*4+2] = c[2];
2467 out4f[x*4+3] = c[3];
// SSE2 version of the 2D texture span sampler: samples the texture bound to
// texunitindex for every pixel of the span and writes packed 4-byte pixels to
// out4ub (texels are copied/filtered in their stored byte order, no channel swap).
// Texture coordinates come from vertex attribute arrayindex, scaled by zf[] and the
// mip level size; they are evaluated at subspan boundaries
// (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels) and stepped in 16.16 fixed point between
// them. The inner loops handle two pixels per iteration, followed by a single-pixel tail.
2473 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2477 int startx = span->startx;
2478 int endx = span->endx;
2480 __m128 data, slope, tcscale;
2481 __m128i tcsize, tcmask, tcoffset, tcmax;
2483 __m128i subtc, substep, endsubtc;
2486 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2487 const unsigned char * RESTRICT pixelbase;
2488 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2489 // if no texture is bound, just fill it with white
2492 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2495 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2496 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2497 // if this mipmap of the texture is 1 pixel, just fill it with that color
2498 if (texture->mipmap[mip][1] == 4)
2500 unsigned int k = *((const unsigned int *)pixelbase);
2501 for (x = startx;x < endx;x++)
2505 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2506 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2507 flags = texture->flags;
// tcsize lanes = (mipmap[2], mipmap[3], mipmap[2], mipmap[3]), used as (width, height) twice
// so that two pixels' (s, t) pairs fit in one register
2508 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2509 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2510 tcscale = _mm_cvtepi32_ps(tcsize);
2511 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2512 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2513 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2514 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset holds the 16-bit pair (4, width*4); _mm_madd_epi16 of an
// (x, y) 16-bit pair with it yields x*4 + y*width*4, the byte offset of that texel
2515 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 packed to 16 bits: upper clamp bound for clamp-to-edge, AND mask for repeat
2516 tcmax = _mm_packs_epi32(tcmask, tcmask);
2517 for (x = startx;x < endx;)
2519 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2520 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) subspan: end exactly on the final pixel and rescale the step
2521 if (nextsub >= endx)
2523 nextsub = endsub = endx-1;
2524 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2528 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2529 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2530 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinates of pixel x, high half = coordinates of pixel x+1; the step is
// doubled because each loop iteration advances two pixels
2531 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2532 substep = _mm_slli_epi32(substep, 1);
// bilinear fast path: if the integer texel coordinates at both ends of the subspan are
// >= 0 and < size-1, neither the texel nor its +1 neighbour needs clamping or wrapping,
// so two horizontally adjacent texels are fetched with a single 64-bit load per row
2535 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2536 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// high 16 bits of a tcoffset lane = width*4 = byte stride between texture rows
2538 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2539 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2541 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2542 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2543 tci = _mm_madd_epi16(tci, tcoffset);
2544 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2545 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2546 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2547 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2548 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2549 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// 15-bit fractions: mulhi((b-a)<<1, frac>>1) equals (b-a)*frac/65536
2550 fracm = _mm_srli_epi16(subtc, 1);
2551 pix1 = _mm_add_epi16(pix1,
2552 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2553 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2554 pix3 = _mm_add_epi16(pix3,
2555 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2556 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2557 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2558 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2559 pix2 = _mm_add_epi16(pix2,
2560 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2561 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2562 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2566 const unsigned char * RESTRICT ptr1;
2567 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2568 tci = _mm_madd_epi16(tci, tcoffset);
2569 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2570 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2571 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2572 fracm = _mm_srli_epi16(subtc, 1);
2573 pix1 = _mm_add_epi16(pix1,
2574 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2575 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2576 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2577 pix1 = _mm_add_epi16(pix1,
2578 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2579 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2580 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, clamp-to-edge: the four neighbour coordinates (+0/+1 in x and y) are built
// with _mm_setr_epi32(0, 1, 0x10000, 0x10001), clamped to [0, size-1], and each texel is
// fetched individually
2584 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2586 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2588 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2589 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2590 tci = _mm_madd_epi16(tci, tcoffset);
2591 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2592 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2593 _mm_setzero_si128());
2594 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2595 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2596 _mm_setzero_si128());
// NOTE(review): the second pixel of the pair wraps its coordinates with _mm_and_si128
// while the first pixel (above) and the single-pixel tail (below) clamp with
// _mm_min_epi16/_mm_max_epi16 - this looks inconsistent for a clamp-to-edge path; confirm.
2597 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2598 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2599 tci = _mm_madd_epi16(tci, tcoffset);
2600 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2601 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2602 _mm_setzero_si128());
2603 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2604 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2605 _mm_setzero_si128());
2606 fracm = _mm_srli_epi16(subtc, 1);
2607 pix1 = _mm_add_epi16(pix1,
2608 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2609 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2610 pix3 = _mm_add_epi16(pix3,
2611 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2612 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2613 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2614 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2615 pix2 = _mm_add_epi16(pix2,
2616 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2617 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2618 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2622 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2623 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2624 tci = _mm_madd_epi16(tci, tcoffset);
2625 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2626 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2627 _mm_setzero_si128());
2628 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2629 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2630 _mm_setzero_si128());
2631 fracm = _mm_srli_epi16(subtc, 1);
2632 pix1 = _mm_add_epi16(pix1,
2633 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2636 pix1 = _mm_add_epi16(pix1,
2637 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2638 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2639 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, repeat addressing: neighbour coordinates are wrapped by AND with size-1
2645 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2647 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2648 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2649 tci = _mm_madd_epi16(tci, tcoffset);
2650 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2651 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2652 _mm_setzero_si128());
2653 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2654 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2655 _mm_setzero_si128());
2656 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2657 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2658 tci = _mm_madd_epi16(tci, tcoffset);
2659 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2660 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2661 _mm_setzero_si128());
2662 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2663 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2664 _mm_setzero_si128());
2665 fracm = _mm_srli_epi16(subtc, 1);
2666 pix1 = _mm_add_epi16(pix1,
2667 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2668 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2669 pix3 = _mm_add_epi16(pix3,
2670 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2671 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2672 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2673 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2674 pix2 = _mm_add_epi16(pix2,
2675 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2676 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2677 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2681 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2682 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2683 tci = _mm_madd_epi16(tci, tcoffset);
2684 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2685 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2686 _mm_setzero_si128());
2687 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2688 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2689 _mm_setzero_si128());
2690 fracm = _mm_srli_epi16(subtc, 1);
2691 pix1 = _mm_add_epi16(pix1,
2692 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2693 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2694 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2695 pix1 = _mm_add_epi16(pix1,
2696 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2697 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2698 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel (unfiltered) lookup, clamp-to-edge: texel is copied as one 32-bit word
2705 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2707 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2709 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2710 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2711 tci = _mm_madd_epi16(tci, tcoffset);
2712 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2713 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2717 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2718 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2719 tci = _mm_madd_epi16(tci, tcoffset);
2720 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel (unfiltered) lookup, repeat addressing
2726 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2728 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2729 tci = _mm_and_si128(tci, tcmax);
2730 tci = _mm_madd_epi16(tci, tcoffset);
2731 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2732 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2736 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2737 tci = _mm_and_si128(tci, tcmax);
2738 tci = _mm_madd_epi16(tci, tcoffset);
2739 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2748 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2751 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadow map lookup: takes a float vector and returns a single float.
// NOTE(review): presumably the result is a light/shadow factor for the given
// direction - TODO confirm against the function body and its callers.
2754 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies each pixel of in4f (four floats per pixel) by the interpolated value
// of vertex attribute arrayindex and stores the product in out4f.
// The attribute at column x is (data + slope*x) * z, where data/slope are filled in
// by DPSOFTRAST_CALCATTRIB4F; z is presumably zf[x] - TODO confirm.
2760 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2763 int startx = span->startx;
2764 int endx = span->endx;
2769 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2770 for (x = startx;x < endx;x++)
// c = interpolated attribute for this pixel
2773 c[0] = (data[0] + slope[0]*x) * z;
2774 c[1] = (data[1] + slope[1]*x) * z;
2775 c[2] = (data[2] + slope[2]*x) * z;
2776 c[3] = (data[3] + slope[3]*x) * z;
// component-wise modulate of the input pixel
2777 out4f[x*4+0] = in4f[x*4+0] * c[0];
2778 out4f[x*4+1] = in4f[x*4+1] * c[1];
2779 out4f[x*4+2] = in4f[x*4+2] * c[2];
2780 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes the interpolated value of vertex attribute arrayindex to out4f
// (four floats per pixel) for every pixel of the span.
// The attribute at column x is (data + slope*x) * z, where data/slope are filled in
// by DPSOFTRAST_CALCATTRIB4F; z is presumably zf[x] - TODO confirm.
2784 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2787 int startx = span->startx;
2788 int endx = span->endx;
2793 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2794 for (x = startx;x < endx;x++)
// c = interpolated attribute for this pixel
2797 c[0] = (data[0] + slope[0]*x) * z;
2798 c[1] = (data[1] + slope[1]*x) * z;
2799 c[2] = (data[2] + slope[2]*x) * z;
2800 c[3] = (data[3] + slope[3]*x) * z;
2801 out4f[x*4+0] = c[0];
2802 out4f[x*4+1] = c[1];
2803 out4f[x*4+2] = c[2];
2804 out4f[x*4+3] = c[3];
2808 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2810 int x, startx = span->startx, endx = span->endx;
2811 float c[4], localcolor[4];
2812 localcolor[0] = subcolor[0];
2813 localcolor[1] = subcolor[1];
2814 localcolor[2] = subcolor[2];
2815 localcolor[3] = subcolor[3];
2816 for (x = startx;x < endx;x++)
2818 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2819 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2820 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2821 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2822 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2823 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2824 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2825 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2829 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2831 int x, startx = span->startx, endx = span->endx;
2832 for (x = startx;x < endx;x++)
2834 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2835 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2836 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2837 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2841 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2843 int x, startx = span->startx, endx = span->endx;
2844 for (x = startx;x < endx;x++)
2846 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2847 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2848 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2849 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Mixes two float pixel buffers over one span: out = ina * a + inb * b per channel,
// where a is one minus the alpha (fourth component) of the inb pixel.
// NOTE(review): b is presumably the inb alpha itself, making this a standard
// "inb over ina" alpha blend - TODO confirm where b is assigned.
// triangle is not used.
2853 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2855 int x, startx = span->startx, endx = span->endx;
2857 for (x = startx;x < endx;x++)
// weight of the ina pixel: the part not covered by inb's alpha
2859 a = 1.0f - inb4f[x*4+3];
2861 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2862 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2863 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2864 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Float blend of a span buffer toward one constant color:
// out4f = in4f * (1 - color[3]) + color * color[3], per component (alpha included),
// for every pixel x in [span->startx, span->endx).
// The triangle parameter is not referenced by the lines visible here.
2868 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2870 int x, startx = span->startx, endx = span->endx;
2871 float localcolor[4], ilerp, lerp;
2872 localcolor[0] = color[0];
2873 localcolor[1] = color[1];
2874 localcolor[2] = color[2];
2875 localcolor[3] = color[3];
// blend weights are loop-invariant: the color's own alpha and its complement
2876 ilerp = 1.0f - localcolor[3];
2877 lerp = localcolor[3];
2878 for (x = startx;x < endx;x++)
2880 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2881 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2882 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2883 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Modulates BGRA8 span pixels by an interpolated vertex attribute:
// out4ub[x] = in4ub[x] * attribute(x) for pixels x in [span->startx, span->endx).
//   arrayindex - attribute array passed to DPSOFTRAST_CALCATTRIB to obtain data/slope
//   zf         - per-pixel factor; (data + slope*x) is multiplied by zf[x]
// The attribute is evaluated exactly only at subspan end points (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, or the last pixel of the span) and stepped
// linearly in 8.8 fixed point in between. Pixels are handled two at a time.
// NOTE: short lines (braces, #ifdef, some declarations and the mod/submod
// assignments) are not visible in this listing.
2889 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2893 int startx = span->startx;
2894 int endx = span->endx;
2897 __m128i submod, substep, endsubmod;
2898 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute order matches the BGRA byte order of the pixels
2899 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2900 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, also kept as 8.8 fixed point (scale 256)
2901 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2902 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2903 for (x = startx; x < endx;)
2905 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2906 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) subspan: end exactly on the final pixel and rescale the step
2907 if (nextsub >= endx)
2909 nextsub = endsub = endx-1;
2910 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute at the subspan end, and the per-pixel fixed point step toward it
2914 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2915 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2916 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// low 4 lanes: modulator for pixel x, high 4 lanes: modulator for pixel x+1
2917 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
// FIX: the loop below consumes two pixels per iteration, so the packed step must be
// two per-pixel steps; a single step advanced the gradient at half rate within a subspan
2918 substep = _mm_slli_epi16(_mm_packs_epi32(substep, substep), 1);
2919 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// pixel bytes are placed in the high byte of each 16-bit lane, so mulhi yields pixel * modulator
2921 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2922 pix = _mm_mulhi_epu16(pix, submod);
2923 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single leftover pixel of the subspan (its guarding condition is on a line not visible here)
2927 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2928 pix = _mm_mulhi_epu16(pix, submod);
2929 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute as BGRA8 pixels:
// out4ub[x] = attribute(x) for pixels x in [span->startx, span->endx).
//   arrayindex - attribute array passed to DPSOFTRAST_CALCATTRIB to obtain data/slope
//   zf         - per-pixel factor; (data + slope*x) is multiplied by zf[x]
// The attribute is evaluated exactly only at subspan end points (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, or the last pixel of the span) and stepped
// linearly in fixed point (scale 4095, shifted right by 4 on output) in between.
// NOTE: short lines (braces, #ifdef, some declarations and the mod/submod
// assignments) are not visible in this listing.
2936 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2940 int startx = span->startx;
2941 int endx = span->endx;
2944 __m128i submod, substep, endsubmod;
2945 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute order matches the BGRA byte order of the output
2946 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2947 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, also kept as fixed point (scale 4095)
2948 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2949 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2950 for (x = startx; x < endx;)
2952 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2953 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) subspan: end exactly on the final pixel and rescale the step
2954 if (nextsub >= endx)
2956 nextsub = endsub = endx-1;
2957 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute at the subspan end, and the per-pixel fixed point step toward it
2961 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2962 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2963 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// low 4 lanes: value for pixel x, high 4 lanes: value for pixel x+1
2964 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
// FIX: the loop below consumes two pixels per iteration, so the packed step must be
// two per-pixel steps; a single step advanced the gradient at half rate within a subspan
2965 substep = _mm_slli_epi16(_mm_packs_epi32(substep, substep), 1);
2966 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then saturate to bytes
2968 __m128i pix = _mm_srai_epi16(submod, 4);
2969 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single leftover pixel of the subspan (its guarding condition is on a line not visible here)
2973 __m128i pix = _mm_srai_epi16(submod, 4);
2974 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 bloom-add over one span: out4ub = saturate(ina4ub + max(inb4ub - subcolor, 0))
// for pixels x in [span->startx, span->endx), two pixels per iteration plus one leftover.
//   subcolor - 4 floats, scaled by 255 and reordered (components 0 and 2 swapped)
//              to match the BGRA byte order of the pixel buffers
// The triangle parameter is not referenced by the lines visible here.
// NOTE: short lines (braces, #ifdef, the leftover-pixel condition) are not visible in this listing.
2981 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2984 int x, startx = span->startx, endx = span->endx;
2985 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes and replicate so both pixels of a pair see the same color
2986 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2987 for (x = startx;x+2 <= endx;x+=2)
2989 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2990 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// FIX: clamp the bloom contribution at zero like the float Draw_Span_AddBloom does;
// without the clamp a bloom texel darker than subcolor subtracted from the frame
2991 pix1 = _mm_add_epi16(pix1, _mm_max_epi16(_mm_sub_epi16(pix2, localcolor), _mm_setzero_si128()));
2992 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single leftover pixel when the span length is odd
2996 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2997 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2998 pix1 = _mm_add_epi16(pix1, _mm_max_epi16(_mm_sub_epi16(pix2, localcolor), _mm_setzero_si128()));
2999 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 component-wise product of two span buffers for pixels x in
// [span->startx, span->endx), two pixels per iteration plus one leftover.
// ina bytes are widened to 16-bit lanes, inb bytes are placed in the high byte of
// each lane (value * 256), so the high half of the 16x16 product is ina * inb / 256.
// NOTE: short lines (braces, #ifdef, the leftover-pixel condition) are not visible in this listing.
3004 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3007 int x, startx = span->startx, endx = span->endx;
3008 for (x = startx;x+2 <= endx;x+=2)
3010 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3011 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3012 pix1 = _mm_mulhi_epu16(pix1, pix2);
3013 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single leftover pixel when the span length is odd
3017 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3018 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3019 pix1 = _mm_mulhi_epu16(pix1, pix2);
3020 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 saturating sum of two span buffers: out4ub = min(ina4ub + inb4ub, 255)
// for pixels x in [span->startx, span->endx), two pixels per iteration plus one leftover.
// The sum is formed in 16-bit lanes and saturated to bytes by _mm_packus_epi16.
// NOTE: short lines (braces, #ifdef, the leftover-pixel condition) are not visible in this listing.
3025 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3028 int x, startx = span->startx, endx = span->endx;
3029 for (x = startx;x+2 <= endx;x+=2)
3031 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3032 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3033 pix1 = _mm_add_epi16(pix1, pix2);
3034 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single leftover pixel when the span length is odd
3038 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3039 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3040 pix1 = _mm_add_epi16(pix1, pix2);
3041 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 tinted add: out4ub = saturate(ina4ub + inb4ub * tint) for pixels x in
// [span->startx, span->endx), two pixels per iteration plus one leftover.
//   inbtintbgra - 4 floats, converted to 8.8 fixed point (scale 256); unlike the
//                 other color parameters in this file it is not reordered here, so
//                 it is used in the order given (the name suggests BGRA - confirm with callers)
// NOTE: short lines (braces, #ifdef, the leftover-pixel condition) are not visible in this listing.
3046 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3049 int x, startx = span->startx, endx = span->endx;
3050 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// narrow to 16-bit lanes and replicate so both pixels of a pair see the same tint
3051 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3052 for (x = startx;x+2 <= endx;x+=2)
3054 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
// inb bytes go in the high byte of each lane so mulhi(tint, pix2) gives inb * tint
3055 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3056 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3057 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single leftover pixel when the span length is odd
3061 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3062 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3063 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3064 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 alpha blend of two span buffers: out4ub = ina4ub + (inb4ub - ina4ub) * alpha(inb4ub)
// for pixels x in [span->startx, span->endx), two pixels per iteration plus one leftover.
// Both the difference and the alpha are shifted left by 4 so the signed high-half
// multiply returns (difference * alpha) / 256.
// NOTE: short lines (braces, #ifdef, the leftover-pixel condition) are not visible in this listing.
3069 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3072 int x, startx = span->startx, endx = span->endx;
3073 for (x = startx;x+2 <= endx;x+=2)
3075 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3076 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's 4th component (alpha) of inb across that pixel's 4 lanes
3077 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3078 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3079 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single leftover pixel when the span length is odd (only the low 4 lanes matter)
3083 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3084 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3085 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3086 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3087 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 blend of a span buffer toward one constant color:
// out4ub = in4ub + (color - in4ub) * color alpha, for pixels x in
// [span->startx, span->endx), two pixels per iteration plus one leftover.
//   color - 4 floats, scaled by 255 and reordered (components 0 and 2 swapped)
//           to match the BGRA byte order of the pixel buffers
// NOTE: short lines (braces, #ifdef, the leftover-pixel condition) are not visible in this listing.
3092 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3095 int x, startx = span->startx, endx = span->endx;
3096 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes and replicate so both pixels of a pair see the same color
3097 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// blend factor: the color's alpha broadcast to every lane, pre-shifted by 4 for the mulhi below
3098 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3099 for (x = startx;x+2 <= endx;x+=2)
3101 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3102 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3103 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single leftover pixel when the span length is odd
3107 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3108 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3109 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: transform-projects positions with the
// ModelViewProjection matrix uniform, and loads the color and TEXCOORD0 arrays.
// TEXCOORD1 is loaded only when the SPECULAR permutation bit is set.
3116 void DPSOFTRAST_VertexShader_Generic(void)
3118 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3119 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3120 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3121 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3122 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader for one span.
// With the DIFFUSE permutation: samples the first texture, modulates it by the
// interpolated attribute array 1 (DPSOFTRAST_ARRAY_COLOR), and, with SPECULAR,
// combines a second texture by multiply (COLORMAPPING), add (GLOW) or alpha
// blend (VERTEXTEXTUREBLEND). The result is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE: braces and the else keyword are on lines not visible in this listing;
// the VaryingBGRA8 call near the end is presumably the no-DIFFUSE path.
3125 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3127 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3128 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3129 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3130 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3132 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture sampled with attribute array 2 (DPSOFTRAST_ARRAY_TEXCOORD0), then tinted by array 1
3134 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3135 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3136 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// NOTE(review): the second texture is sampled with the same attribute array (2) as the
// first, although the vertex shader loads TEXCOORD1 for SPECULAR - confirm this is intended
3138 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3139 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3142 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// FIX: this branch re-tested SHADERPERMUTATION_COLORMAPPING, which the branch above
// already consumed, so the additive combine could never run; it is now keyed on
// SHADERPERMUTATION_GLOW (NOTE(review): confirm GLOW is the bit the caller sets for add mode)
3144 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3147 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3149 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3152 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured output: the interpolated attribute array 1 alone
3157 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3158 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: transform-projects positions with the
// ModelViewProjection matrix uniform and loads TEXCOORD0 and TEXCOORD1 unchanged.
3163 void DPSOFTRAST_VertexShader_PostProcess(void)
3165 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3166 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3167 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the post-process shader for one span: samples the first texture
// into the fragment color, optionally adds bloom from the second texture (BLOOM
// permutation, thresholded by the BloomColorSubtract uniform), blends toward the
// ViewTintColor uniform, and hands the result to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The SATURATION and GAMMARAMPS permutations are tested but have no implementation.
// NOTE: braces are on lines not visible in this listing.
3170 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3172 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3173 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3174 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3175 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3176 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// attribute array 2 is DPSOFTRAST_ARRAY_TEXCOORD0
3177 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3178 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// attribute array 3 is DPSOFTRAST_ARRAY_TEXCOORD1
3180 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3181 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3183 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3184 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3186 // TODO: implement saturation
3188 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3190 // TODO: implement gammaramps
3192 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only transform-projects positions with
// the ModelViewProjection matrix uniform; no other attribute arrays are touched.
3197 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3199 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: zero-fills the fragment color bytes for
// pixels span->startx..span->endx-1 and hands them to DPSOFTRAST_Draw_Span_FinishBGRA8.
3202 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3204 // this is never called (because colormask is off when this shader is used)
3205 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3206 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3207 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, only the active part of the span is cleared
3208 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3209 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: transform-projects positions with the
// ModelViewProjection matrix uniform and transforms TEXCOORD0 by the TexMatrix uniform.
3214 void DPSOFTRAST_VertexShader_FlatColor(void)
3216 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3217 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span:
// output = color texture * Color_Ambient uniform, with the alpha factor taken
// from the Alpha uniform instead of Color_Ambient's 4th component.
// Pixels are written straight into the color framebuffer row unless alpha test or
// a non-opaque blend mode is active, in which case they go to a local buffer that
// is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE: short lines (braces, #ifdef, else, loop-control statements and the
// declarations of color/pix/pix2) are not visible in this listing.
3220 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3223 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row in color buffer 0, 4 bytes per pixel
3224 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3225 int x, startx = span->startx, endx = span->endx;
3226 __m128i Color_Ambientm;
3227 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3228 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3230 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3231 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the scratch buffer instead
3232 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3233 pixel = buffer_FragColorbgra8;
// ambient color as 8.8 fixed point with components 0 and 2 swapped (BGRA order)
3234 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear the 4th lane and replace it with the Alpha uniform scaled by 255
3235 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3236 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3237 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3238 for (x = startx;x < endx;x++)
// fast path: 4 consecutive pixels whose mask bytes are all 1 are shaded in one go
3241 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3244 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3245 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3246 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3247 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// slow path: a single pixel
3253 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3254 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3255 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3257 if (pixel == buffer_FragColorbgra8)
3258 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: transform-projects positions with the
// ModelViewProjection matrix uniform, loads the color array unchanged, and
// transforms TEXCOORD0 by the TexMatrix uniform.
3264 void DPSOFTRAST_VertexShader_VertexColor(void)
3266 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3267 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3268 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span:
// output = color texture * (Color_Ambient + Color_Diffuse * interpolated vertex color),
// with the alpha factor taken from the Alpha uniform (the diffuse alpha lane is zeroed).
// The vertex color is interpolated per pixel from DPSOFTRAST_ARRAY_COLOR and multiplied
// by buffer_z[x]. Pixels are written straight into the color framebuffer row unless
// alpha test or a non-opaque blend mode is active, in which case they go to a local
// buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE: short lines (braces, #ifdef, else, loop-control statements and the
// declarations of data/slope/mod2/pix2) are not visible in this listing.
3271 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3274 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row in color buffer 0, 4 bytes per pixel
3275 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3276 int x, startx = span->startx, endx = span->endx;
3277 __m128i Color_Ambientm, Color_Diffusem;
3279 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3280 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3281 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3282 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3283 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3284 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the scratch buffer instead
3285 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3286 pixel = buffer_FragColorbgra8;
// ambient color as 8.8 fixed point in BGRA order; 4th lane replaced by Alpha * 255
3287 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3288 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3289 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3290 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096: multiplied (high half) with the 4096-scaled vertex
// color below this lands on the same 8.8 scale as the ambient term; 4th lane cleared
3291 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3292 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3293 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color interpolant: reorder to BGRA, advance to startx, scale by 4096
3294 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3295 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3296 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3297 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3298 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3299 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3300 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3302 __m128i color, mod, pix;
// fast path: 4 consecutive pixels whose mask bytes are all 1 are shaded in one go
3303 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3306 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3307 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// step the interpolant once per pixel, multiplying each by its own z value;
// data is advanced three times here, the loop increment supplies the fourth
3308 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3309 data = _mm_add_ps(data, slope);
3310 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3311 data = _mm_add_ps(data, slope);
3312 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3313 data = _mm_add_ps(data, slope);
3314 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3315 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3316 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3317 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3318 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3319 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// slow path: a single pixel
3325 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3326 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3327 mod = _mm_packs_epi32(mod, mod);
3328 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3329 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3331 if (pixel == buffer_FragColorbgra8)
3332 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: transform-projects positions with the
// ModelViewProjection matrix uniform, transforms TEXCOORD0 by the TexMatrix
// uniform, and loads TEXCOORD4 unchanged (the pixel stage samples the lightmap with it).
3338 void DPSOFTRAST_VertexShader_Lightmap(void)
3340 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3341 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3342 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
// output = color texture * (Color_Ambient + Color_Diffuse * lightmap texture),
// plus glow texture * Color_Glow when the GLOW permutation is set.
// The alpha factor comes from the Alpha uniform; diffuse and glow alpha lanes are zeroed.
// Pixels are written straight into the color framebuffer row unless alpha test or
// a non-opaque blend mode is active, in which case they go to a local buffer that
// is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE: short lines (braces, #ifdef, else, loop-control statements and the
// declaration of pix2) are not visible in this listing; the second pixel loop is
// presumably the non-GLOW branch.
3345 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3348 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row in color buffer 0, 4 bytes per pixel
3349 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3350 int x, startx = span->startx, endx = span->endx;
3351 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3352 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3353 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3354 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3355 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3358 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3359 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the finish pass, so render into the scratch buffer instead
3360 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3361 pixel = buffer_FragColorbgra8;
// ambient color as 8.8 fixed point in BGRA order; 4th lane replaced by Alpha * 255
3362 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3363 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3364 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3365 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color as 8.8 fixed point in BGRA order; 4th lane cleared
3366 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3367 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3368 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3369 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3371 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow color as 8.8 fixed point in BGRA order; 4th lane cleared
3372 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3373 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half ambient, high half glow: lets the single-pixel path do both products in one multiply
3375 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3376 for (x = startx;x < endx;x++)
3378 __m128i color, lightmap, glow, pix;
// fast path: 4 consecutive pixels whose mask bytes are all 1 are shaded in one go
3379 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3382 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3383 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3384 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3385 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3386 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3387 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3388 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3389 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3390 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3391 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// slow path: a single pixel; the low half computes the lit color, the high half the
// glow term (lightmap's high half is zero, so only Color_Glowm remains there)
3397 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3398 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3399 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3400 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// fold the glow term (high half) onto the lit color (low half)
3401 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3402 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same shading without the glow term
3407 for (x = startx;x < endx;x++)
3409 __m128i color, lightmap, pix;
3410 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3413 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3414 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3415 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3416 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3417 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3418 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3419 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3425 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3426 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3427 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3428 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3431 if (pixel == buffer_FragColorbgra8)
3432 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the fake-light shader: only transform-projects positions with
// the ModelViewProjection matrix uniform; no other attribute arrays are touched.
3438 void DPSOFTRAST_VertexShader_FakeLight(void)
3440 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake-light shader: no lighting is computed in the lines visible
// here; the fragment color bytes for pixels span->startx..span->endx-1 are zero-filled
// and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3443 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3446 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3447 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3448 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, only the active part of the span is cleared
3449 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3450 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the model-space light-direction-map mode: delegates entirely
// to the plain lightmap vertex shader.
3455 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3457 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for the model-space light-direction-map mode: delegates entirely
// to the plain lightmap pixel shader with the same thread/triangle/span.
3460 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3462 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for the tangent-space light-direction-map mode: delegates entirely
// to the plain lightmap vertex shader.
3468 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3470 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for the tangent-space light-direction-map mode: delegates entirely
// to the plain lightmap pixel shader with the same thread/triangle/span.
3473 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3475 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3481 void DPSOFTRAST_VertexShader_LightDirection(void)
3484 int numvertices = dpsoftrast.numvertices;
3486 float LightVector[4];
3487 float EyePosition[4];
3488 float EyeVectorModelSpace[4];
3494 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3495 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3496 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3497 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3498 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3499 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3500 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3501 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3502 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3503 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3504 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3505 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3506 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3507 for (i = 0;i < numvertices;i++)
3509 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3510 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3511 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3512 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3513 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3514 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3515 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3516 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3517 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3518 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3519 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3520 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3521 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3522 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3523 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3524 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3525 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3526 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3527 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3528 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3529 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3530 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3531 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3532 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3533 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3534 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3535 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3536 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3537 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3539 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Smaller / larger of two values. These are plain macros, so an argument may
// be evaluated more than once; do not pass expressions with side effects.
#define DPSOFTRAST_Min(lhs,rhs) ((lhs) < (rhs) ? (lhs) : (rhs))
#define DPSOFTRAST_Max(lhs,rhs) ((lhs) > (rhs) ? (lhs) : (rhs))
// Dot product over elements [0..2] of two indexable vectors.
#define DPSOFTRAST_Vector3Dot(p,q) ((p)[0]*(q)[0]+(p)[1]*(q)[1]+(p)[2]*(q)[2])
// Squared length and length of a 3-vector, both built on the dot product.
#define DPSOFTRAST_Vector3LengthSquared(vec) (DPSOFTRAST_Vector3Dot((vec),(vec)))
#define DPSOFTRAST_Vector3Length(vec) (sqrt(DPSOFTRAST_Vector3LengthSquared(vec)))
// Statement-style macro operating on the 3-vector v. It starts by computing
// the vector's length as sqrt(dot(v,v)) into a local float named len.
// NOTE(review): presumably v is then scaled by 1/len when len is non-zero -
// confirm against the full macro body before relying on that.
3547 #define DPSOFTRAST_Vector3Normalize(v)\
3550 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel stage of the LightDirection mode.
// Shades one span into buffer_FragColorbgra8 and hands it to
// DPSOFTRAST_Draw_Span_FinishBGRA8. thread->shader_permutation selects one of
// three per-pixel loops:
//   SPECULAR            : ambient + diffuse + specular
//   DIFFUSE (otherwise) : ambient + diffuse
//   neither             : ambient only
// In every loop the GLOW permutation adds a glow-texture term to B, G and R.
// Each output channel is truncated to int and clamped to at most 255; alpha is
// always diffuse-texture alpha times the Alpha uniform.
// NOTE(review): the pants/shirt (COLORMAPPING) contribution is added to
// diffusetex only in the SPECULAR loop, although the pants/shirt textures are
// sampled for all three paths - confirm whether the DIFFUSE and ambient-only
// loops are meant to ignore it.
3561 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers; the *bgra8 buffers hold 4 bytes per pixel and are indexed with x*4+channel
3563 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3564 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3565 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3566 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3567 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3568 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3569 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3570 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571 int x, startx = span->startx, endx = span->endx;
3572 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolation start value and per-x slope for the TEXCOORD1 (light) and TEXCOORD2 (eye) attributes
3573 float LightVectordata[4];
3574 float LightVectorslope[4];
3575 float EyeVectordata[4];
3576 float EyeVectorslope[4];
3578 float diffusetex[4];
3580 float surfacenormal[4];
3581 float lightnormal[4];
3583 float specularnormal[4];
3586 float SpecularPower;
// Colour uniforms are copied with components 0 and 2 swapped so that index c
// of each local matches byte c of the *bgra8 buffers it is multiplied with.
// Alpha slots are 0 except Color_Ambient[3], which carries the Alpha uniform.
3588 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3589 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3590 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3591 Color_Glow[3] = 0.0f;
3592 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3593 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3594 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3595 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3596 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3597 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3598 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3599 Color_Pants[3] = 0.0f;
3600 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3601 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3602 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3603 Color_Shirt[3] = 0.0f;
// Texture fetches shared by all paths: base colour always, pants/shirt and
// glow only when the matching permutation bit is set. All use TEXCOORD0.
3604 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3605 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3606 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3608 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3609 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3611 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3613 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: SPECULAR (ambient + diffuse + specular) ----
3615 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3617 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3618 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3619 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3620 Color_Diffuse[3] = 0.0f;
3621 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3622 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3623 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3624 LightColor[3] = 0.0f;
3625 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3626 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3628 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3629 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3630 Color_Specular[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the gloss alpha byte (0..255)
3631 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3632 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3633 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3634 for (x = startx;x < endx;x++)
// NOTE(review): z used below is presumably the per-pixel value taken from buffer_z[x] - confirm
3637 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3638 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3639 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3640 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// colormapping: add tinted pants and shirt layers on top of the base colour
// (Color_Pants[3] and Color_Shirt[3] are 0, so alpha is left unchanged)
3641 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3643 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3644 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3645 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3646 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3648 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3649 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3650 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3651 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte/128 - 1 per component, reading bytes 2,1,0 into x,y,z
3652 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3653 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3654 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3655 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated light vector at this x, scaled by z, then normalized
3657 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3658 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3659 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3660 DPSOFTRAST_Vector3Normalize(lightnormal);
// interpolated eye vector at this x, scaled by z, then normalized
3662 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3663 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3664 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3665 DPSOFTRAST_Vector3Normalize(eyenormal);
// normalized sum of the light and eye directions, used for the specular term
3667 specularnormal[0] = lightnormal[0] + eyenormal[0];
3668 specularnormal[1] = lightnormal[1] + eyenormal[1];
3669 specularnormal[2] = lightnormal[2] + eyenormal[2];
3670 DPSOFTRAST_Vector3Normalize(specularnormal);
// both dot products are clamped at 0 from below; the specular exponent is SpecularPower * gloss alpha
3672 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3673 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3674 specular = pow(specular, SpecularPower * glosstex[3]);
// with GLOW: glow + ambient + (diffuse + specular) * LightColor; alpha = texture alpha * Alpha uniform
3675 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3677 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3678 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3679 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3680 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3684 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3685 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3686 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3687 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// store the clamped result as 4 bytes for this pixel
3689 buffer_FragColorbgra8[x*4+0] = d[0];
3690 buffer_FragColorbgra8[x*4+1] = d[1];
3691 buffer_FragColorbgra8[x*4+2] = d[2];
3692 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: DIFFUSE (ambient + diffuse, no specular, no eye vector or gloss texture needed) ----
3695 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3697 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3698 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3699 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3700 Color_Diffuse[3] = 0.0f;
3701 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3702 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3703 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3704 LightColor[3] = 0.0f;
3705 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3706 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3707 for (x = startx;x < endx;x++)
// NOTE(review): no pants/shirt colormapping is applied to diffusetex in this loop
3710 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3711 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3712 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3713 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// normal-map decode and light vector interpolation, as in the SPECULAR loop
3714 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3715 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3716 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3717 DPSOFTRAST_Vector3Normalize(surfacenormal);
3719 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3720 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3721 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3722 DPSOFTRAST_Vector3Normalize(lightnormal);
3724 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// with GLOW: glow + texture * (ambient + diffuse colour * diffuse * LightColor)
3725 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3727 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3728 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3729 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3730 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3734 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3735 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3736 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3737 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3739 buffer_FragColorbgra8[x*4+0] = d[0];
3740 buffer_FragColorbgra8[x*4+1] = d[1];
3741 buffer_FragColorbgra8[x*4+2] = d[2];
3742 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (neither SPECULAR nor DIFFUSE); no normal map or light vector is used ----
3747 for (x = startx;x < endx;x++)
// NOTE(review): no pants/shirt colormapping is applied to diffusetex in this loop either
3750 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3751 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3752 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3753 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// with GLOW: glow + texture * ambient
3755 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3757 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3758 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3759 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3760 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// texture * ambient only
3764 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3765 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3766 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3767 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3769 buffer_FragColorbgra8[x*4+0] = d[0];
3770 buffer_FragColorbgra8[x*4+1] = d[1];
3771 buffer_FragColorbgra8[x*4+2] = d[2];
3772 buffer_FragColorbgra8[x*4+3] = d[3];
// whichever path ran, hand the shaded span to the common finisher
3775 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3780 void DPSOFTRAST_VertexShader_LightSource(void)
3783 int numvertices = dpsoftrast.numvertices;
3784 float LightPosition[4];
3785 float LightVector[4];
3786 float LightVectorModelSpace[4];
3787 float EyePosition[4];
3788 float EyeVectorModelSpace[4];
3794 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3795 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3796 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3797 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3798 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3799 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3800 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3801 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3802 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3803 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3804 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3805 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3806 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3807 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3808 for (i = 0;i < numvertices;i++)
3810 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3811 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3812 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3813 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3814 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3815 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3816 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3817 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3818 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3819 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3820 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3821 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3822 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3823 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3824 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3825 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3826 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3827 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
3828 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3829 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3830 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3831 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3832 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3833 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3834 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3835 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3836 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3837 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3838 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3839 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3840 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3841 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3843 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3844 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3847 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3850 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3851 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3852 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3853 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3854 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3855 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3856 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3857 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3858 int x, startx = span->startx, endx = span->endx;
3859 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3860 float CubeVectordata[4];
3861 float CubeVectorslope[4];
3862 float LightVectordata[4];
3863 float LightVectorslope[4];
3864 float EyeVectordata[4];
3865 float EyeVectorslope[4];
3867 float diffusetex[4];
3869 float surfacenormal[4];
3870 float lightnormal[4];
3872 float specularnormal[4];
3875 float SpecularPower;
3876 float CubeVector[4];
3879 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3880 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3881 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3882 Color_Glow[3] = 0.0f;
3883 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3884 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3885 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3886 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3887 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3888 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3889 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3890 Color_Diffuse[3] = 0.0f;
3891 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3892 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3893 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3894 Color_Specular[3] = 0.0f;
3895 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3896 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3897 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3898 Color_Pants[3] = 0.0f;
3899 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3900 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3901 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3902 Color_Shirt[3] = 0.0f;
3903 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3904 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3905 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3906 LightColor[3] = 0.0f;
3907 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3908 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3909 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3910 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3911 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3912 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3913 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3914 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3916 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3917 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3919 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3920 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3921 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3923 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3924 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3925 for (x = startx;x < endx;x++)
3928 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3929 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3930 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3931 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3932 if (attenuation < 0.01f)
3934 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3936 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3937 if (attenuation < 0.01f)
3941 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3942 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3943 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3944 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3945 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3947 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3948 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3949 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3950 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3952 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3953 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3954 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3955 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3956 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3957 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3958 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3959 DPSOFTRAST_Vector3Normalize(surfacenormal);
3961 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3962 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3963 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3964 DPSOFTRAST_Vector3Normalize(lightnormal);
3966 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3967 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3968 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3969 DPSOFTRAST_Vector3Normalize(eyenormal);
3971 specularnormal[0] = lightnormal[0] + eyenormal[0];
3972 specularnormal[1] = lightnormal[1] + eyenormal[1];
3973 specularnormal[2] = lightnormal[2] + eyenormal[2];
3974 DPSOFTRAST_Vector3Normalize(specularnormal);
3976 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3977 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3978 specular = pow(specular, SpecularPower * glosstex[3]);
3979 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3981 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3982 attenuation *= (1.0f / 255.0f);
3983 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3984 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3985 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3986 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3990 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3991 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3992 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3993 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3995 buffer_FragColorbgra8[x*4+0] = d[0];
3996 buffer_FragColorbgra8[x*4+1] = d[1];
3997 buffer_FragColorbgra8[x*4+2] = d[2];
3998 buffer_FragColorbgra8[x*4+3] = d[3];
4001 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4003 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4004 for (x = startx;x < endx;x++)
4007 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4008 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4009 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4010 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4011 if (attenuation < 0.01f)
4013 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4015 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4016 if (attenuation < 0.01f)
4020 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4021 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4022 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4023 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4024 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4026 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4027 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4028 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4029 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4031 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4032 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4033 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4034 DPSOFTRAST_Vector3Normalize(surfacenormal);
4036 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4037 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4038 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4039 DPSOFTRAST_Vector3Normalize(lightnormal);
4041 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4042 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4044 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4045 attenuation *= (1.0f / 255.0f);
4046 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4047 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4048 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4049 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4053 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4054 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4055 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4056 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4058 buffer_FragColorbgra8[x*4+0] = d[0];
4059 buffer_FragColorbgra8[x*4+1] = d[1];
4060 buffer_FragColorbgra8[x*4+2] = d[2];
4061 buffer_FragColorbgra8[x*4+3] = d[3];
4066 for (x = startx;x < endx;x++)
4069 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4070 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4071 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4072 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4073 if (attenuation < 0.01f)
4075 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4077 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4078 if (attenuation < 0.01f)
4082 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4083 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4084 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4085 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4086 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4088 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4089 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4090 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4091 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4093 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4095 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4096 attenuation *= (1.0f / 255.0f);
4097 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4098 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4099 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4100 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4104 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4105 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4106 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4107 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4109 buffer_FragColorbgra8[x*4+0] = d[0];
4110 buffer_FragColorbgra8[x*4+1] = d[1];
4111 buffer_FragColorbgra8[x*4+2] = d[2];
4112 buffer_FragColorbgra8[x*4+3] = d[3];
4115 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the refraction shader mode.
// Calls DPSOFTRAST_Array_TransformProject with the position array as both array arguments and a
// pointer into dpsoftrast.uniform4f at the ModelViewProjectionMatrixM1 slot (4 floats per uniform index).
4121 void DPSOFTRAST_VertexShader_Refraction(void)
4123 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the refraction shader mode.
// No refraction effect is computed by the statements below: after DPSOFTRAST_Draw_Span_Begin is
// called with the local buffer_z, the BGRA8 bytes of pixels [span->startx, span->endx) are set to
// zero and the buffer is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span to shade; only its startx/endx are read directly here
4126 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for one maximum-length span (4 bytes per pixel for the color buffer)
4129 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4130 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4131 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the color buffer is cleared
4132 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4133 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the water shader mode.
// Calls DPSOFTRAST_Array_TransformProject with the position array as both array arguments and a
// pointer into dpsoftrast.uniform4f at the ModelViewProjectionMatrixM1 slot (4 floats per uniform index).
4138 void DPSOFTRAST_VertexShader_Water(void)
4140 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the water shader mode.
// No water effect is computed by the statements below: after DPSOFTRAST_Draw_Span_Begin is called
// with the local buffer_z, the BGRA8 bytes of pixels [span->startx, span->endx) are set to zero
// and the buffer is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span to shade; only its startx/endx are read directly here
4144 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for one maximum-length span (4 bytes per pixel for the color buffer)
4147 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4148 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the color buffer is cleared
4150 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4151 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the show-depth shader mode.
// Calls DPSOFTRAST_Array_TransformProject with the position array as both array arguments and a
// pointer into dpsoftrast.uniform4f at the ModelViewProjectionMatrixM1 slot (4 floats per uniform index).
4156 void DPSOFTRAST_VertexShader_ShowDepth(void)
4158 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the show-depth shader mode.
// No depth visualisation is computed by the statements below: after DPSOFTRAST_Draw_Span_Begin is
// called with the local buffer_z, the BGRA8 bytes of pixels [span->startx, span->endx) are set to
// zero and the buffer is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span to shade; only its startx/endx are read directly here
4161 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for one maximum-length span (4 bytes per pixel for the color buffer)
4164 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4165 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4166 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the color buffer is cleared
4167 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4168 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the deferred-geometry shader mode.
// Calls DPSOFTRAST_Array_TransformProject with the position array as both array arguments and a
// pointer into dpsoftrast.uniform4f at the ModelViewProjectionMatrixM1 slot (4 floats per uniform index).
4173 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4175 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the deferred-geometry shader mode.
// No geometry-buffer output is computed by the statements below: after DPSOFTRAST_Draw_Span_Begin
// is called with the local buffer_z, the BGRA8 bytes of pixels [span->startx, span->endx) are set
// to zero and the buffer is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span to shade; only its startx/endx are read directly here
4178 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for one maximum-length span (4 bytes per pixel for the color buffer)
4181 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4182 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4183 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the color buffer is cleared
4184 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4185 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the deferred-light-source shader mode.
// Calls DPSOFTRAST_Array_TransformProject with the position array as both array arguments and a
// pointer into dpsoftrast.uniform4f at the ModelViewProjectionMatrixM1 slot (4 floats per uniform index).
4190 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4192 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage registered for the deferred-light-source shader mode.
// No lighting is computed by the statements below: after DPSOFTRAST_Draw_Span_Begin is called with
// the local buffer_z, the BGRA8 bytes of pixels [span->startx, span->endx) are set to zero and
// the buffer is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span to shade; only its startx/endx are read directly here
4195 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for one maximum-length span (4 bytes per pixel for the color buffer)
4198 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4199 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4200 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) part of the color buffer is cleared
4201 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4202 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode description used by the rasterizer: which functions to run and which vertex
// arrays and texture units the mode consumes. One instance per row of DPSOFTRAST_ShaderModeTable.
// NOTE(review): the table rows start with an integer before the Vertex pointer and the draw code
// reads a `lodarrayindex` member of this struct; confirm that member is declared ahead of Vertex.
4207 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; the table stores the DPSOFTRAST_VertexShader_* functions here
4210 void (*Vertex)(void);
// span (pixel) stage; DPSOFTRAST_Draw_ProcessSpans calls it once per span
4211 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices the mode interpolates; consumers test each entry against
// DPSOFTRAST_ARRAY_TOTAL, and the table writes ~0 (255 as unsigned char) after the last used entry
4212 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* texture units the mode samples; consumers test each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, and the table writes ~0 after the last used entry
4213 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4215 DPSOFTRAST_ShaderModeInfo;
// Shader mode dispatch table, indexed by shader mode (see DPSOFTRAST_ShaderModeTable[thread->shader_mode]
// in the draw code). Each row gives: a leading integer (2 in every row; read as lodarrayindex by
// the draw code), the vertex function, the span function, the list of vertex arrays to
// interpolate, and the list of texture units used for mip selection. Both lists end with ~0.
// NOTE(review): the last five rows (Refraction .. DeferredLightSource) give no initializer for the
// texture unit list, so under C aggregate-initialization rules that array is zero-filled rather
// than starting with ~0 as in the Depth_Or_Shadow and FakeLight rows - confirm unit 0 is intended.
4217 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4219 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4220 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4221 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4222 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4223 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4224 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4225 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4226 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4227 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4228 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4229 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4230 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4231 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4232 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4233 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4234 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Flushes the thread's span queue: for every span in thread->spans[0 .. numspans) it optionally
// depth-tests the span to build a per-pixel mask, runs the current shader mode's Span function
// when a color buffer exists and the color mask is non-zero, and then resets numspans to 0.
//   thread - per-thread rasterizer state owning the span and triangle queues
4237 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4244 // unsigned int *colorpixel;
4245 unsigned int *depthpixel;
4251 DPSOFTRAST_State_Triangle *triangle;
4252 DPSOFTRAST_State_Span *span;
// one mask byte per pixel of a span; span->pixelmask is pointed at this local array, so the
// pointer is only meaningful while this function is running
4253 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4254 for (i = 0; i < thread->numspans; i++)
4256 span = &thread->spans[i];
4257 triangle = &thread->triangles[span->triangle];
// depth-tested path: needs depth testing enabled and a depth buffer
4258 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope, w[2] = origin) at the span's
// position and convert it, and its per-pixel step, to integer depth units
4260 wslope = triangle->w[0];
4261 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4262 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
// the polygon offset term is subtracted from the scaled depth
4263 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel[x] addresses the depth buffer at (span->x + x, span->y)
4264 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4265 startx = span->startx;
// fill pixelmask[startx .. endx) by comparing the stored depth with the interpolated depth d,
// using the comparison selected by fb_depthfunc
4267 switch(thread->fb_depthfunc)
4270 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4271 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4272 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4273 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4274 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4275 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4276 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4278 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4279 //for (x = startx;x < endx;x++)
4280 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4281 // if there is no color buffer, skip pixel shader
// trim masked-out pixels from both ends of the span before shading
4282 while (startx < endx && !pixelmask[startx])
4284 while (endx > startx && !pixelmask[endx-1])
4287 continue; // no pixels to fill
// publish the mask and the trimmed start to the span for the shader
4288 span->pixelmask = pixelmask;
4289 span->startx = startx;
4291 // run pixel shader if appropriate
4292 // do this before running depthmask code, to allow the pixelshader
4293 // to clear pixelmask values for alpha testing
4294 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4295 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// with depth writes enabled, walk the span again with the same interpolated depth values
// NOTE(review): presumably stores d into depthpixel[x] for pixels still set in pixelmask - confirm
4296 if (thread->depthmask)
4297 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4303 // no depth testing means we're just dealing with color...
4304 // if there is no color buffer, skip pixel shader
4305 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes: mark [startx, endx) as 1 in the mask
4307 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4308 span->pixelmask = pixelmask;
4309 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the queue is empty again
4313 thread->numspans = 0;
// Declares the Draw command with number 22 through the DEFCOMMAND macro; presumably this produces
// the DPSOFTRAST_Command_Draw type and DPSOFTRAST_OPCODE_Draw used by the draw code - confirm
// against the macro definition. Fields as used by the draw code:
//   starty/endy  - compared against the thread's y bands to reject the whole command early
//   refcount     - atomically decremented by each interpreting thread; the thread that reaches 0
//                  frees `arrays` when it was allocated separately from the command
//   clipped, firstvertex, numvertices, numtriangles - copied into locals by the interpreter
//   arrays       - vertex data: screen coordinates first (4 floats per vertex), followed by the
//                  per-vertex arrays of the shader mode
//   element3i / element3s - triangle index list as int or unsigned short triples
4316 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command for the calling thread's part of the framebuffer.
// For each triangle it clips against the near plane, builds the interpolation planes for w and
// for every vertex array the shader mode lists, selects mip levels for the shader mode's texture
// units, and converts the covered scanlines into spans queued on thread->spans.
// DPSOFTRAST_Draw_ProcessSpans is called whenever the span or triangle queue fills up, and once
// more at the end if anything is still queued.
//   thread  - per-thread rasterizer state (scissor, y bands miny1..maxy1 and miny2..maxy2, queues,
//             bound textures, shader mode)
//   command - the Draw command; its refcount is atomically decremented here, and on reaching zero
//             command->arrays is released with MM_FREE when commandsize is no larger than the
//             aligned size of DPSOFTRAST_Command_Draw (presumably the case where the allocator
//             placed the vertex data in a separate allocation - confirm)
4318 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4321 int cullface = thread->cullface;
4322 int minx, maxx, miny, maxy;
4323 int miny1, maxy1, miny2, maxy2;
4324 __m128i fbmin, fbmax;
4325 __m128 viewportcenter, viewportscale;
4326 int firstvertex = command->firstvertex;
4327 int numvertices = command->numvertices;
4328 int numtriangles = command->numtriangles;
4329 const int *element3i = command->element3i;
4330 const unsigned short *element3s = command->element3s;
4331 int clipped = command->clipped;
4338 int starty, endy, bandy;
4342 __m128 triangleedge1, triangleedge2, trianglenormal;
4345 DPSOFTRAST_State_Triangle *triangle;
4346 DPSOFTRAST_Texture *texture;
4347 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, and this thread's two y bands clamped into it
4348 miny = thread->fb_scissor[1];
4349 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4350 miny1 = bound(miny, thread->miny1, maxy);
4351 maxy1 = bound(miny, thread->maxy1, maxy);
4352 miny2 = bound(miny, thread->miny2, maxy);
4353 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's y range overlaps neither band: drop this thread's reference and, on the last
// release, free separately allocated vertex data
4354 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4356 if (!ATOMIC_DECREMENT(command->refcount))
4358 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4359 MM_FREE(command->arrays);
// horizontal scissor range; fbmin/fbmax hold the clip rectangle as repeated 16-bit (x, y) pairs
// spanning from the top of band 1 to the bottom of band 2, with fbmax made inclusive (max - 1)
4363 minx = thread->fb_scissor[0];
4364 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4365 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4366 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4367 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4368 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4369 screen[3] = _mm_setzero_ps();
4370 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4371 for (i = 0;i < numtriangles;i++)
// the command's data begins with the screen coordinates (4 floats per vertex); `arrays` starts at
// the per-vertex block that follows them
4373 const float *screencoord4f = command->arrays;
4374 const float *arrays = screencoord4f + numvertices*4;
4376 // generate the 3 edges of this triangle
4377 // generate spans for the triangle - switch based on left split or right split classification of triangle
// triangle's three vertex indices relative to firstvertex, read from the 16-bit or the 32-bit index list
4380 e[0] = element3s[i*3+0] - firstvertex;
4381 e[1] = element3s[i*3+1] - firstvertex;
4382 e[2] = element3s[i*3+2] - firstvertex;
4386 e[0] = element3i[i*3+0] - firstvertex;
4387 e[1] = element3i[i*3+1] - firstvertex;
4388 e[2] = element3i[i*3+2] - firstvertex;
// helper macros used by the clipping code that follows:
//   SKIPBACKFACE      - builds the two triangle edges from screen[0..2] and the scalar term
//                       edge1.x*edge2.y - edge1.y*edge2.x, then tests its sign against zero
//                       (NOTE(review): which sign test applies is presumably chosen by cullface - confirm)
//   CLIPPEDVERTEXLERP - derives the clip fraction between vertices p1 and p2 from clipdist,
//                       interpolates the vertex and projects it into screen[k]
//   CLIPPEDVERTEXCOPY - loads an unclipped vertex's stored screen coordinate into screen[k]
//   GENATTRIBCOPY / GENATTRIBLERP / GENATTRIBS - the same copy-or-interpolate choice applied to an
//                       attribute array, reusing the fractions stored in clipfrac[]
4397 #define SKIPBACKFACE \
4398 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4399 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4400 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4401 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4402 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4406 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4410 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4415 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4416 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4418 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4419 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4421 #define CLIPPEDVERTEXCOPY(k,p1) \
4422 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4424 #define GENATTRIBCOPY(attrib, p1) \
4425 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4426 #define GENATTRIBLERP(attrib, p1, p2) \
4428 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4429 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4431 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4435 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4436 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4437 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4438 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4439 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4440 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4441 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4447 // calculate distance from nearplane
// clipdist is component 2 plus component 3 (z + w) of each vertex; values >= 0 are treated as in
// front of the plane, and the branches below cover every in-front/behind combination
4448 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4449 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4450 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4451 if (clipdist[0] >= 0.0f)
4453 if (clipdist[1] >= 0.0f)
4455 if (clipdist[2] >= 0.0f)
4458 // triangle is entirely in front of nearplane
4459 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4466 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4474 if (clipdist[2] >= 0.0f)
4476 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4483 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4490 else if (clipdist[1] >= 0.0f)
4492 if (clipdist[2] >= 0.0f)
4494 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4501 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4507 else if (clipdist[2] >= 0.0f)
4509 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4514 else continue; // triangle is entirely behind nearplane
4517 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as signed 16-bit pairs; with three
// points, point 2 is repeated in the fourth slot. The min/max steps reduce this to a bounding
// box, which is then clamped to the clip rectangle.
4518 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4519 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4520 screenmin = _mm_min_epi16(screeni, screenir),
4521 screenmax = _mm_max_epi16(screeni, screenir);
4522 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4523 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4524 screenmin = _mm_max_epi16(screenmin, fbmin);
4525 screenmax = _mm_min_epi16(screenmax, fbmax);
4526 // skip offscreen triangles
4527 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// integer scanline range covered by the triangle; endy is exclusive
4529 starty = _mm_extract_epi16(screenmin, 1);
4530 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely in the gap between the two bands
4531 if (starty >= maxy1 && endy <= miny2)
// shift out the x halves so that each 32-bit lane holds one point's y
4533 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle queue
4536 triangle = &thread->triangles[thread->numtriangles];
4538 // calculate attribute plans for triangle data...
4539 // okay, this triangle is going to produce spans, we'd better project
4540 // the interpolants now (this is what gives perspective texturing),
4541 // this consists of simply multiplying all arrays by the W coord
4542 // (which is basically 1/Z), which will be undone per-pixel
4543 // (multiplying by Z again) to get the perspective-correct array
4546 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4547 __m128 mipedgescale, mipdensity;
// edge components divided by the cross-product term from SKIPBACKFACE, then broadcast one per register
4548 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4549 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4550 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4551 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4552 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of each screen point, broadcast to all lanes
4553 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4554 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4555 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane equation for w over the screen: x slope, y slope and value at the origin go to triangle->w[0..2]
4556 attribedge1 = _mm_sub_ss(w0, w1);
4557 attribedge2 = _mm_sub_ss(w2, w1);
4558 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4559 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4560 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4561 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4562 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4563 _mm_store_ss(&triangle->w[0], attribxslope);
4564 _mm_store_ss(&triangle->w[1], attribyslope);
4565 _mm_store_ss(&triangle->w[2], attriborigin);
4566 mipedgescale = _mm_setzero_ps();
// same plane setup, on w-multiplied values, for each vertex array the shader mode lists; `arrays`
// advances by one per-vertex block per listed array, in the order the allocator laid them out
4567 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4569 __m128 attrib0, attrib1, attrib2;
4570 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4571 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4573 arrays += numvertices*4;
4574 GENATTRIBS(attrib0, attrib1, attrib2);
4575 attriborigin = _mm_mul_ps(attrib1, w1);
4576 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4577 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4578 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4579 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4580 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4581 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4582 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4583 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// for the array named by lodarrayindex, also record the change of its first two components along
// each triangle edge divided by that edge's screen length (approximate reciprocal square root)
4584 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4586 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4587 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4588 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4589 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level per texture unit of the shader mode; levels stay 0 unless the bound texture
// uses a filter above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4593 memset(triangle->mip, 0, sizeof(triangle->mip));
4594 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4596 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4597 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4599 texture = thread->texbound[texunit];
4600 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale the per-edge density by the two ints at texture->mipmap[0][2] and [3] (presumably the
// level-0 width and height - confirm), take the squared length per edge and keep the smaller
4602 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4603 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4604 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4605 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4606 // this will be multiplied in the texturing routine by the texture resolution
4607 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), i.e. log2 of the square root of the squared density, clamped to the last mip level
4610 y = (int)(log((float)y)*0.5f/M_LN2);
4611 if (y > texture->mipmaps - 1)
4612 y = texture->mipmaps - 1;
4613 triangle->mip[texunit] = y;
// walk the scanlines in two passes: first up to the end of band 1, then from miny2 up to the end of band 2
4619 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4622 __m128 xcoords, xslope;
// ycc flags the points whose y is below the current scanline (y > screeny); yccmask carries
// four mask bits per point, point 0 in the lowest nibble
4623 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4624 int yccmask = _mm_movemask_epi8(ycc);
// each of the two active edges runs from screen[edgeNp] to screen[edgeNn]
4625 int edge0p, edge0n, edge1p, edge1n;
// edge selection for four points
4632 case 0xFFFF: /*0000*/ y = endy; continue;
4633 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4634 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4635 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4636 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4637 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4638 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4639 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4640 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4641 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4642 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4643 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4644 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4645 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4646 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4647 case 0x0000: /*1111*/ y++; continue;
// edge selection for three points; the top two mask nibbles always match because point 2 was
// duplicated into the fourth slot when the y values were packed
4655 case 0xFFFF: /*000*/ y = endy; continue;
4656 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4657 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4658 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4659 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4660 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4661 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4662 case 0x0000: /*111*/ y++; continue;
// nexty = smallest y among the points not yet passed (flagged lanes become 0x7FFF before the
// minimum is taken), limited to the last scanline of the current band
4665 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4666 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4667 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4668 nexty = _mm_extract_epi16(ycc, 0);
4669 if (nexty >= bandy) nexty = bandy-1;
// per-edge dx/dy, and each edge's x at scanline y plus 0.5; the low two lanes belong to edge0,
// the high two lanes to edge1
4670 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4671 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4672 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4673 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4674 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the edge with the smaller x in the low lanes
4675 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4677 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4678 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// emit spans for scanlines y .. nexty, stepping both edge x values by their slopes
4680 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4682 int startx, endx, offset;
4683 startx = _mm_cvtss_si32(xcoords);
4684 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// left clamp: startx is raised to 0 and then moved by (minx - startx) masked down to a multiple
// of DPSOFTRAST_DRAW_MAXSPANLENGTH (assumes that constant is a power of two - TODO confirm)
4687 if (startx < 0) startx = 0;
4688 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4690 if (endx > maxx) endx = maxx;
4691 if (startx >= endx) continue;
// cut [startx, endx) into pieces of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels; the span's
// startx/endx are relative to `offset`, and the queue is flushed once it holds DPSOFTRAST_DRAW_MAXSPANS spans
4692 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4694 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4695 span->triangle = thread->numtriangles;
4698 span->startx = max(minx - offset, 0);
4699 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4700 if (span->startx >= span->endx)
4702 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4703 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle is queued: when the triangle queue is full, flush the spans and restart the queue
4708 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4710 DPSOFTRAST_Draw_ProcessSpans(thread);
4711 thread->numtriangles = 0;
// this thread is done with the command: drop its reference and, on the last release, free
// separately allocated vertex data
4715 if (!ATOMIC_DECREMENT(command->refcount))
4717 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4718 MM_FREE(command->arrays);
// flush whatever is still queued
4721 if (thread->numspans > 0 || thread->numtriangles > 0)
4723 DPSOFTRAST_Draw_ProcessSpans(thread);
4724 thread->numtriangles = 0;
// Allocates a Draw command together with storage for its vertex data and triangle indices, and
// points the global post-transform array table (dpsoftrast.post_array4f) at that storage.
//   firstvertex, numvertices, numtriangles - copied into the command
//   element3i / element3s - triangle index list as int or unsigned short triples; the list in use
//                           is copied into the command's data
// Data layout, in order: screen coordinates, position array, one array per entry in the shader
// mode's array list (each numvertices * float[4]), then the index list.
// NOTE(review): the return type is a command pointer; presumably `command` is returned - confirm.
4729 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4733 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// room for the screen coordinates and the position array
4734 int datasize = 2*numvertices*sizeof(float[4]);
4735 DPSOFTRAST_Command_Draw *command;
4736 unsigned char *data;
// plus one per-vertex array for each array the current shader mode lists
4737 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4739 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4740 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4742 datasize += numvertices*sizeof(float[4]);
// plus the index list, sized for 16-bit or 32-bit indices
4745 datasize += numtriangles*sizeof(unsigned short[3]);
4747 datasize += numtriangles*sizeof(int[3]);
4748 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to embed: allocate the bare command and a separate zeroed data block
// NOTE(review): no NULL check on the MM_CALLOC result is visible before `data` is used - confirm
4749 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4751 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4752 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data lives directly after the command header in the same allocation
4756 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4757 data = (unsigned char *)command + commandsize;
4759 command->firstvertex = firstvertex;
4760 command->numvertices = numvertices;
4761 command->numtriangles = numtriangles;
4762 command->arrays = (float *)data;
// reset the global array table, then hand out consecutive numvertices * float[4] blocks
4763 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4764 dpsoftrast.firstvertex = firstvertex;
4765 dpsoftrast.numvertices = numvertices;
4766 dpsoftrast.screencoord4f = (float *)data;
4767 data += numvertices*sizeof(float[4]);
4768 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4769 data += numvertices*sizeof(float[4]);
4770 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4772 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4773 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4775 dpsoftrast.post_array4f[j] = (float *)data;
4776 data += numvertices*sizeof(float[4]);
// the index list follows the vertex arrays; only one of the two index pointers ends up non-NULL
4778 command->element3i = NULL;
4779 command->element3s = NULL;
4782 command->element3s = (unsigned short *)data;
4783 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4787 command->element3i = (int *)data;
4788 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queue a triangle draw call.
//
// Allocates a Draw command for the given vertex range and index list, runs the
// current shader mode's Vertex() function, clamps the affected row range to the
// framebuffer and then hands the command to the rasterizer.
// element3i / element3s: int or unsigned short index list (copied by
// DPSOFTRAST_Draw_AllocateDrawCommand).
4793 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4795 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// NOTE(review): Vertex() presumably fills the arrays set up by the allocator
// and updates dpsoftrast.drawstarty/drawendy/drawclipped, which are read
// below - confirm.
4796 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// Clamp the touched row range to [0, fb_height].
4797 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4798 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// Empty row range: release the separately allocated data block (only
// header-only commands have one) and give the command space back to the pool.
4799 if (command->starty >= command->endy)
4801 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4802 MM_FREE(command->arrays);
4803 DPSOFTRAST_UndoCommand(command->commandsize);
4806 command->clipped = dpsoftrast.drawclipped;
// One reference per thread; the Draw interpreter decrements refcount and frees
// the separately allocated data when it reaches zero.
4807 command->refcount = dpsoftrast.numthreads;
// Threaded path: publish the command, then wake every starving thread whose
// row band(s) overlap [starty, endy).
4809 if (dpsoftrast.usethreads)
4812 DPSOFTRAST_Draw_SyncCommands();
4813 for (i = 0; i < dpsoftrast.numthreads; i++)
4815 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// NOTE(review): thread->starving is read here with no visible drawmutex lock,
// whereas DPSOFTRAST_Draw_FlushThreads locks around the same check - confirm
// this is intentional.
4816 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4817 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the non-threaded path - executes the queued
// commands immediately; confirm.
4822 DPSOFTRAST_Draw_FlushThreads();
// Execute queued commands for one thread.
//
// Walks dpsoftrast.commandpool.commands from thread->commandoffset until the
// offset equals endoffset, dispatching on each command's opcode, and stores the
// resulting offset back into thread->commandoffset.
//
// INTERPCOMMAND(name) expands to the case for a fixed-size command: it calls
// DPSOFTRAST_Interpret_<name>, advances by the aligned size of
// DPSOFTRAST_Command_<name>, and wraps the offset to 0 once it reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
4826 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4828 int commandoffset = thread->commandoffset;
4829 while (commandoffset != endoffset)
4831 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4832 switch (command->opcode)
4834 #define INTERPCOMMAND(name) \
4835 case DPSOFTRAST_OPCODE_##name : \
4836 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4837 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4838 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4839 commandoffset = 0; \
4841 INTERPCOMMAND(Viewport)
4842 INTERPCOMMAND(ClearColor)
4843 INTERPCOMMAND(ClearDepth)
4844 INTERPCOMMAND(ColorMask)
4845 INTERPCOMMAND(DepthTest)
4846 INTERPCOMMAND(ScissorTest)
4847 INTERPCOMMAND(Scissor)
4848 INTERPCOMMAND(BlendFunc)
4849 INTERPCOMMAND(BlendSubtract)
4850 INTERPCOMMAND(DepthMask)
4851 INTERPCOMMAND(DepthFunc)
4852 INTERPCOMMAND(DepthRange)
4853 INTERPCOMMAND(PolygonOffset)
4854 INTERPCOMMAND(CullFace)
4855 INTERPCOMMAND(AlphaTest)
4856 INTERPCOMMAND(AlphaFunc)
4857 INTERPCOMMAND(SetTexture)
4858 INTERPCOMMAND(SetShader)
4859 INTERPCOMMAND(Uniform4f)
4860 INTERPCOMMAND(UniformMatrix4f)
4861 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized (vertex data may follow the header), so the
// advance uses the size stored in the command instead of sizeof.
4863 case DPSOFTRAST_OPCODE_Draw:
4864 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4865 commandoffset += command->commandsize;
4866 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// The thread's offset is written back right after each Draw.
// NOTE(review): presumably so other threads can observe progress through
// long draw sequences - confirm.
4868 thread->commandoffset = commandoffset;
4871 case DPSOFTRAST_OPCODE_Reset:
// Record how far this thread has consumed the pool.
4876 thread->commandoffset = commandoffset;
// Worker thread entry point (handed to Thread_CreateThread by DPSOFTRAST_Init).
// data is the thread's own DPSOFTRAST_State_Thread.
//
// Runs while thread->index >= 0: interprets any commands between the thread's
// offset and dpsoftrast.drawcommand, then sleeps on drawcond when there is
// nothing left to do.
4879 static int DPSOFTRAST_Draw_Thread(void *data)
4881 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4882 while(thread->index >= 0)
4884 if (thread->commandoffset != dpsoftrast.drawcommand)
4886 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Re-check under drawmutex before sleeping, so that new work or a stop request
// (index < 0) seen here prevents the wait.
4890 Thread_LockMutex(thread->drawmutex);
4891 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Caught up: wake a flusher blocked on waitcond, then advertise via 'starving'
// that this thread needs a drawcond signal to continue.
4893 if (thread->waiting) Thread_CondSignal(thread->waitcond);
4894 thread->starving = true;
4895 Thread_CondWait(thread->drawcond, thread->drawmutex);
4896 thread->starving = false;
4898 Thread_UnlockMutex(thread->drawmutex);
// Make sure every thread has consumed all queued commands, then mark the
// command pool as empty (usedcommands = 0).
4904 static void DPSOFTRAST_Draw_FlushThreads(void)
4906 DPSOFTRAST_State_Thread *thread;
4908 DPSOFTRAST_Draw_SyncCommands();
4909 if (dpsoftrast.usethreads)
// Pass 1: wake every thread that is behind and currently sleeping on drawcond.
// The unlocked comparison is a cheap pre-check; it is repeated under the mutex.
4911 for (i = 0; i < dpsoftrast.numthreads; i++)
4913 thread = &dpsoftrast.threads[i];
4914 if (thread->commandoffset != dpsoftrast.drawcommand)
4916 Thread_LockMutex(thread->drawmutex);
4917 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4918 Thread_CondSignal(thread->drawcond);
4919 Thread_UnlockMutex(thread->drawmutex);
// Pass 2: for every thread still behind, block on its waitcond; the worker
// signals waitcond when it has caught up and sees 'waiting' set.
4922 for (i = 0; i < dpsoftrast.numthreads; i++)
4924 thread = &dpsoftrast.threads[i];
4925 if (thread->commandoffset != dpsoftrast.drawcommand)
4927 Thread_LockMutex(thread->drawmutex);
// NOTE(review): the wait is guarded by 'if', not a loop - if Thread_CondWait
// can return spuriously the thread may still be behind afterwards; confirm.
4928 if (thread->commandoffset != dpsoftrast.drawcommand)
4930 thread->waiting = true;
4931 Thread_CondWait(thread->waitcond, thread->drawmutex);
4932 thread->waiting = false;
4934 Thread_UnlockMutex(thread->drawmutex);
// NOTE(review): presumably the non-threaded path - the caller itself interprets
// the pending commands for each thread state; confirm.
4940 for (i = 0; i < dpsoftrast.numthreads; i++)
4942 thread = &dpsoftrast.threads[i];
4943 if (thread->commandoffset != dpsoftrast.drawcommand)
4944 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Everything has been consumed, so the pool's usage counter starts over.
4947 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point: forwards to DPSOFTRAST_Draw_FlushThreads(), which has all
// queued commands executed and resets the command pool usage.
4950 void DPSOFTRAST_Flush(void)
4952 DPSOFTRAST_Draw_FlushThreads();
// Public entry point; takes no arguments and returns nothing.
// NOTE(review): presumably completes all outstanding rendering work before
// returning - TODO confirm against the implementation.
4955 void DPSOFTRAST_Finish(void)
// Initialize the global software rasterizer state and its per-thread states.
//
// width, height: framebuffer dimensions, stored in fb_width / fb_height and
//                used for the initial viewport and scissor rectangles.
// numthreads:    requested worker count. Threads are used only when this is
//                > 0 and Thread_HasThreads() is true; the count is then limited
//                with bound(1, numthreads, 64), otherwise exactly 1 state is used.
// interlace:     limited with bound(0, interlace, 1), forced to 0 when not
//                threaded; selects how rows are split between threads.
// colorpixels, depthpixels: caller-supplied buffers stored in
//                fb_colorpixels[0] and fb_depthpixels.
// Return value:  TODO(review) document - not evident from the statements here.
4960 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// Start from an all-zero state; every field not assigned below stays 0/NULL.
4970 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): presumably a byte-order probe (u.b[3] is nonzero only on a
// big-endian host if u holds the int value 1) - confirm against u's setup.
4971 dpsoftrast.bigendian = u.b[3];
4972 dpsoftrast.fb_width = width;
4973 dpsoftrast.fb_height = height;
4974 dpsoftrast.fb_depthpixels = depthpixels;
4975 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index [1] is assigned three times; this looks like a copy/paste
// slip for [1], [2], [3]. Harmless at runtime because of the memset above, but
// confirm against the declared size of fb_colorpixels before changing.
4976 dpsoftrast.fb_colorpixels[1] = NULL;
4977 dpsoftrast.fb_colorpixels[1] = NULL;
4978 dpsoftrast.fb_colorpixels[1] = NULL;
// Default viewport covers the whole framebuffer.
4979 dpsoftrast.viewport[0] = 0;
4980 dpsoftrast.viewport[1] = 0;
4981 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4982 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4983 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Texture bookkeeping: no texture array exists yet (texture_max = 0, pointer
// zeroed by the memset), while firstfree/end both start at 1.
4984 dpsoftrast.texture_firstfree = 1;
4985 dpsoftrast.texture_end = 1;
4986 dpsoftrast.texture_max = 0;
4987 dpsoftrast.color[0] = 1;
4988 dpsoftrast.color[1] = 1;
4989 dpsoftrast.color[2] = 1;
4990 dpsoftrast.color[3] = 1;
4991 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
4992 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
4993 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// Zero-initialized array of per-thread states.
// NOTE(review): no NULL check on the MM_CALLOC result is visible before the
// loop below dereferences it - confirm out-of-memory handling.
4994 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4995 for (i = 0; i < dpsoftrast.numthreads; i++)
4997 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// Per-thread default render state.
4999 thread->cullface = GL_BACK;
// NOTE(review): only colormask[1..3] are visibly set to 1; colormask[0] would
// stay 0 from MM_CALLOC unless it is assigned elsewhere - confirm.
5000 thread->colormask[1] = 1;
5001 thread->colormask[2] = 1;
5002 thread->colormask[3] = 1;
5003 thread->blendfunc[0] = GL_ONE;
5004 thread->blendfunc[1] = GL_ZERO;
5005 thread->depthmask = true;
5006 thread->depthtest = true;
5007 thread->depthfunc = GL_LEQUAL;
5008 thread->scissortest = false;
5009 thread->alphatest = false;
5010 thread->alphafunc = GL_GREATER;
5011 thread->alphavalue = 0.5f;
5012 thread->viewport[0] = 0;
5013 thread->viewport[1] = 0;
5014 thread->viewport[2] = dpsoftrast.fb_width;
5015 thread->viewport[3] = dpsoftrast.fb_height;
5016 thread->scissor[0] = 0;
5017 thread->scissor[1] = 0;
5018 thread->scissor[2] = dpsoftrast.fb_width;
5019 thread->scissor[3] = dpsoftrast.fb_height;
5020 thread->depthrange[0] = 0;
5021 thread->depthrange[1] = 1;
5022 thread->polygonoffset[0] = 0;
5023 thread->polygonoffset[1] = 0;
// Row ownership. Interlaced: thread i gets two bands, the i-th of 2*numthreads
// equal slices and the (numthreads+i)-th. Otherwise one band, the i-th of
// numthreads slices, stored in both the *1 and *2 fields.
5025 if (dpsoftrast.interlace)
5027 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5028 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5029 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5030 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5034 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5035 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Empty work queues and no pending synchronization.
5038 thread->numspans = 0;
5039 thread->numtriangles = 0;
5040 thread->commandoffset = 0;
5041 thread->waiting = false;
5042 thread->starving = false;
5044 thread->validate = -1;
5045 DPSOFTRAST_Validate(thread, -1);
// Threaded mode: create the synchronization objects before starting the worker
// that uses them.
5047 if (dpsoftrast.usethreads)
5049 thread->waitcond = Thread_CreateCond();
5050 thread->drawcond = Thread_CreateCond();
5051 thread->drawmutex = Thread_CreateMutex();
5052 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Tear down the rasterizer: stop and join the worker threads, destroy their
// synchronization objects, free texture and thread storage, and zero the
// global state.
5058 void DPSOFTRAST_Shutdown(void)
5061 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5063 DPSOFTRAST_State_Thread *thread;
5064 for (i = 0; i < dpsoftrast.numthreads; i++)
5066 thread = &dpsoftrast.threads[i];
// Wake the worker under its mutex so it can observe the stop request, then
// join it before destroying the objects it uses.
// NOTE(review): DPSOFTRAST_Draw_Thread exits when thread->index < 0, so the
// index is presumably made negative before this signal - confirm.
5067 Thread_LockMutex(thread->drawmutex);
5069 Thread_CondSignal(thread->drawcond);
5070 Thread_UnlockMutex(thread->drawmutex);
5071 Thread_WaitThread(thread->thread, 0);
5072 Thread_DestroyCond(thread->waitcond);
5073 Thread_DestroyCond(thread->drawcond);
5074 Thread_DestroyMutex(thread->drawmutex);
// Texture pixel data is released with MM_FREE; the texture array itself with
// free().
// NOTE(review): this loop indexes dpsoftrast.texture[] before the NULL check on
// dpsoftrast.texture below. DPSOFTRAST_Init leaves texture NULL with
// texture_end = 1, so calling Shutdown before any texture array exists would
// read through a NULL pointer - confirm and guard the loop if so.
5077 for (i = 0;i < dpsoftrast.texture_end;i++)
5078 if (dpsoftrast.texture[i].bytes)
5079 MM_FREE(dpsoftrast.texture[i].bytes);
5080 if (dpsoftrast.texture)
5081 free(dpsoftrast.texture);
5082 if (dpsoftrast.threads)
5083 MM_FREE(dpsoftrast.threads);
// Leave the global state zeroed so no stale pointers remain.
5084 memset(&dpsoftrast, 0, sizeof(dpsoftrast));