3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
// Alignment in bytes requested by MM_MALLOC/MM_CALLOC allocations.
14 #define ATOMIC_SIZE 32
// Per-toolchain definitions of:
//   ALIGN(var) / ATOMIC(var)  - declare var with 16- / 32-byte alignment
//   MEMORY_BARRIER            - store fence (_mm_sfence)
//   ATOMIC_COUNTER            - counter type used with the operations below
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD - counter updates
// Apple: counters use the OSAtomic*32Barrier functions.
17 #if defined(__APPLE__)
18 #include <libkern/OSAtomic.h>
19 #define ALIGN(var) var __attribute__((__aligned__(16)))
20 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21 #define MEMORY_BARRIER (_mm_sfence())
22 #define ATOMIC_COUNTER volatile int32_t
23 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: counters use the __sync_* builtins.
26 #elif defined(__GNUC__)
27 #define ALIGN(var) var __attribute__((__aligned__(16)))
28 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: counters use the Interlocked* functions on a volatile LONG.
35 #elif defined(_MSC_VER)
36 #define ALIGN(var) __declspec(align(16)) var
37 #define ATOMIC(var) __declspec(align(32)) var
38 #define MEMORY_BARRIER (_mm_sfence())
40 #define ATOMIC_COUNTER volatile LONG
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: no alignment attribute, no barrier, and plain (non-atomic)
// integer arithmetic on an ordinary int counter.
48 #define ALIGN(var) var
51 #define ATOMIC(var) var
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
70 #include <emmintrin.h>
// Allocates size bytes aligned to ATOMIC_SIZE; release with MM_FREE.
#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// Zero-initialised, ATOMIC_SIZE-aligned counterpart of calloc().
// Returns NULL when nmemb*size would overflow size_t or when the
// allocation fails.  Release the result with MM_FREE.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;

	// reject products that would wrap around and under-allocate
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

// Releases memory obtained from MM_MALLOC / MM_CALLOC.
#define MM_FREE _mm_free
83 #define MM_MALLOC(size) malloc(size)
84 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texcoord sets.  DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is
// used to size per-array tables.
88 typedef enum DPSOFTRAST_ARRAY_e
90 DPSOFTRAST_ARRAY_POSITION,
91 DPSOFTRAST_ARRAY_COLOR,
92 DPSOFTRAST_ARRAY_TEXCOORD0,
93 DPSOFTRAST_ARRAY_TEXCOORD1,
94 DPSOFTRAST_ARRAY_TEXCOORD2,
95 DPSOFTRAST_ARRAY_TEXCOORD3,
96 DPSOFTRAST_ARRAY_TEXCOORD4,
97 DPSOFTRAST_ARRAY_TEXCOORD5,
98 DPSOFTRAST_ARRAY_TEXCOORD6,
99 DPSOFTRAST_ARRAY_TEXCOORD7,
100 DPSOFTRAST_ARRAY_TOTAL
// Texture record (only some of its fields are visible in this excerpt).
104 typedef struct DPSOFTRAST_Texture_s
// filter mode, set by DPSOFTRAST_Texture_Filter
111 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts outstanding uses of this texture by queued commands - confirm
114 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks a free slot
115 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
116 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records are padded to COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND)
// and declared with ALIGN().
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header of every command record: the opcode and the record's size
// as stored by DPSOFTRAST_AllocateCommand.
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
125 unsigned char opcode;
126 unsigned short commandsize;
// Opcode 0 is the marker DPSOFTRAST_AllocateCommand writes when a command
// does not fit in the space left before the end of the pool.
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> and
// struct DPSOFTRAST_Command_<name>, which begins with the common header.
// NOTE(review): the line that expands `fields` is not visible in this excerpt.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
136 unsigned char opcode; \
137 unsigned short commandsize; \
139 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool buffer (2 MiB).
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size allowed for one command - confirm against its users.
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Pool that queued commands are carved from.  The allocator indexes
// `commands` with dpsoftrast.commandpool.freecommand and wraps at
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
148 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
150 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data (only some of its fields are visible in this excerpt).
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
154 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// interpolation plane per attribute array, as used by DPSOFTRAST_CALCATTRIB:
// [array][0] = change per x, [array][1] = change per y, [array][2] = base value
156 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
158 DPSOFTRAST_State_Triangle);
// Evaluates one attribute array of `triangle` at the start of `span` (SSE).
//   slope <- attribs[arrayindex][0]                (change per x step)
//   data  <- attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// `data` and `slope` must be __m128 lvalues; the attribs rows are read with
// _mm_load_ps and therefore must be 16-byte aligned.
// All macro arguments are parenthesised so that expression arguments
// (e.g. `spans + i`) expand correctly.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[(arrayindex)][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[(arrayindex)][2]), \
		_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
			_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[(arrayindex)][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: `data` and `slope` are float[4].
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[(arrayindex)][0][0]; \
	(slope)[1] = (triangle)->attribs[(arrayindex)][0][1]; \
	(slope)[2] = (triangle)->attribs[(arrayindex)][0][2]; \
	(slope)[3] = (triangle)->attribs[(arrayindex)][0][3]; \
	(data)[0] = (triangle)->attribs[(arrayindex)][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][0]; \
	(data)[1] = (triangle)->attribs[(arrayindex)][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][1]; \
	(data)[2] = (triangle)->attribs[(arrayindex)][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][2]; \
	(data)[3] = (triangle)->attribs[(arrayindex)][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][3]; \
}
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle
// (only some of its fields are visible in this excerpt).
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
181 int triangle; // triangle this span was generated by
182 int x; // framebuffer x coord
183 int y; // framebuffer y coord
184 int startx; // usable range (according to pixelmask)
185 int endx; // usable range (according to pixelmask)
186 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
188 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in thread->validate; DPSOFTRAST_Validate recomputes the
// matching derived state (RecalcFB / RecalcDepthFunc / RecalcBlendFunc)
// and clears the bit.
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All derived state.
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Internal blend modes; DPSOFTRAST_RecalcBlendFunc chooses one from the
// source/destination blend factors and the blendsubtract flag and stores
// it in thread->fb_blendmode.  DPSOFTRAST_BLENDMODE_TOTAL is the count.
198 typedef enum DPSOFTRAST_BLENDMODE_e
200 DPSOFTRAST_BLENDMODE_OPAQUE,
201 DPSOFTRAST_BLENDMODE_ALPHA,
202 DPSOFTRAST_BLENDMODE_ADDALPHA,
203 DPSOFTRAST_BLENDMODE_ADD,
204 DPSOFTRAST_BLENDMODE_INVMOD,
205 DPSOFTRAST_BLENDMODE_MUL,
206 DPSOFTRAST_BLENDMODE_MUL2,
207 DPSOFTRAST_BLENDMODE_SUBALPHA,
208 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209 DPSOFTRAST_BLENDMODE_INVADD,
210 DPSOFTRAST_BLENDMODE_TOTAL
212 DPSOFTRAST_BLENDMODE;
// Per-thread copy of the render state, updated by the DPSOFTRAST_Interpret_*
// functions (only some of its fields are visible in this excerpt).
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// set by DPSOFTRAST_Interpret_PolygonOffset: [0] = alongnormal, [1] = intoview
233 float polygonoffset[2];
236 int shader_permutation;
237 int shader_exactspecularmath;
// bound textures; pointers are rebased by DPSOFTRAST_Texture_Grow when the texture array moves
239 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
241 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
242 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
244 // DPSOFTRAST_VALIDATE_ flags
247 // derived values (DPSOFTRAST_VALIDATE_FB)
// filled in by DPSOFTRAST_RecalcViewport
250 ALIGN(float fb_viewportcenter[4]);
251 ALIGN(float fb_viewportscale[4]);
253 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
256 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's position in the command pool; compared with
// dpsoftrast.drawcommand and commandpool.freecommand in DPSOFTRAST_Draw_FreeCommandPool
265 ATOMIC(volatile int commandoffset);
// waiting is set by DPSOFTRAST_Draw_FreeCommandPool while it blocks on waitcond;
// starving is read there to decide whether to signal drawcond
267 volatile bool waiting;
268 volatile bool starving;
275 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
276 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
278 DPSOFTRAST_State_Thread);
// Global rasterizer state, instantiated once as `dpsoftrast`
// (only some of its fields are visible in this excerpt).
280 typedef ATOMIC(struct DPSOFTRAST_State_s
// framebuffer storage: one depth buffer and up to four color buffers
284 unsigned int *fb_depthpixels;
285 unsigned int *fb_colorpixels[4];
// viewport transform, recomputed by DPSOFTRAST_Viewport
288 ALIGN(float fb_viewportcenter[4]);
289 ALIGN(float fb_viewportscale[4]);
292 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
293 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// vertex array pointers
295 const float *pointer_vertex3f;
296 const float *pointer_color4f;
297 const unsigned char *pointer_color4ub;
298 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
301 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// bound textures; rebased by DPSOFTRAST_Texture_Grow when the texture array moves
303 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
307 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
308 float *screencoord4f;
314 int shader_permutation;
315 int shader_exactspecularmath;
// texture table: texture_firstfree is where DPSOFTRAST_Texture_New starts
// searching for a free slot; `texture` is the (reallocatable) array itself
319 int texture_firstfree;
320 DPSOFTRAST_Texture *texture;
// last error message, set by the API functions on invalid arguments
325 const char *errorstring;
// worker thread states (numthreads entries)
330 DPSOFTRAST_State_Thread *threads;
// command pool position published by DPSOFTRAST_Draw_SyncCommands
332 ATOMIC(volatile int drawcommand);
334 DPSOFTRAST_State_Command_Pool commandpool;
338 DPSOFTRAST_State dpsoftrast;
// Scale applied to (1 - depth) when converting to the integer depth buffer: 2^30.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into a 32-bit BGRA8 value:
// a in bits 24-31, r in 16-23, g in 8-15, b in 0-7, each rounded to nearest.
// Arguments are parenthesised so expression arguments scale correctly, and
// the alpha shift is done on an unsigned value so that alpha >= 0.5 does not
// overflow a signed int during the shift.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth to the integer depth format:
// 1.0 maps to 0 and 0.0 maps to DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
346 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
348 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
349 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
350 fb_viewportcenter[3] = 0.5f;
351 fb_viewportcenter[0] = 0.0f;
352 fb_viewportscale[1] = 0.5f * viewport[2];
353 fb_viewportscale[2] = -0.5f * viewport[3];
354 fb_viewportscale[3] = 0.5f;
355 fb_viewportscale[0] = 1.0f;
358 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
360 if (dpsoftrast.interlace)
362 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
363 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
364 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
365 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
370 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Recomputes thread->fb_scissor as {x, y, width, height} in framebuffer
// coordinates, then refreshes the thread's viewport transform and its
// scanline ranges.
374 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
376 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
377 // and viewport projection values
// scissor rectangle, with y flipped against the framebuffer height
380 x1 = thread->scissor[0];
381 x2 = thread->scissor[0] + thread->scissor[2];
382 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
383 y2 = dpsoftrast.fb_height - thread->scissor[1];
// scissor test disabled: use the whole framebuffer
384 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the right / bottom edges to the framebuffer
386 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
388 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
389 thread->fb_scissor[0] = x1;
390 thread->fb_scissor[1] = y1;
391 thread->fb_scissor[2] = x2 - x1;
392 thread->fb_scissor[3] = y2 - y1;
394 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
395 DPSOFTRAST_RecalcThread(thread);
398 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
400 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Chooses thread->fb_blendmode from the (source, destination) blend factor
// pair, which is packed into one switch value as (sfactor<<16)|dfactor.
// Any pair that has no case below falls back to DPSOFTRAST_BLENDMODE_OPAQUE.
403 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_ALPHA, GL_ONE) is recognised
405 if (thread->blendsubtract)
407 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
409 #define BLENDFUNC(sfactor, dfactor, blendmode) \
410 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
411 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
412 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// second switch on the same packed pair
// NOTE(review): presumably the non-subtractive branch; the else line is not visible in this excerpt
417 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
419 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
420 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
421 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
422 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
423 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two different factor pairs map to MUL
424 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
425 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
426 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
427 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
428 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
429 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the requested flags is dirty.
// NOTE(review): `thread` is not parenthesised in the expansion; pass a plain identifier.
434 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state for every flag in `mask` that is currently
// set in thread->validate, clearing each flag before its Recalc function runs.
436 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested flags that are actually dirty
438 mask &= thread->validate;
441 if (mask & DPSOFTRAST_VALIDATE_FB)
443 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
444 DPSOFTRAST_RecalcFB(thread);
446 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
448 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
449 DPSOFTRAST_RecalcDepthFunc(thread);
451 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
453 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
454 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by handle.  A handle is valid when it lies in
// [1, texture_end) and the slot has pixel storage; the slot's address is
// then returned.
// NOTE(review): the failure return is not visible in this excerpt; callers test the result with `!texture`.
458 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
460 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
461 return &dpsoftrast.texture[index];
// Enlarges the texture array and rebases every texbound[] pointer (global
// and per-thread) from the old array onto the reallocated one.
465 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be converted to offsets
467 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
468 DPSOFTRAST_State_Thread *thread;
472 // expand texture array as needed
473 if (dpsoftrast.texture_max < 1024)
474 dpsoftrast.texture_max = 1024;
// NOTE(review): presumably under an else that is not visible in this excerpt (start at 1024, otherwise double)
476 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored straight into dpsoftrast.texture and not checked in the
// visible lines; on failure the old array would be lost and the rebasing below would use NULL
477 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings
478 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
479 if (dpsoftrast.texbound[i])
480 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase every thread's bindings
481 for (j = 0; j < dpsoftrast.numthreads; j++)
483 thread = &dpsoftrast.threads[j];
484 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
485 if (thread->texbound[i])
486 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates the arguments (setting dpsoftrast.errorstring
// on rejection), claims a free slot in the texture array, fills in the
// per-level mipmap table and allocates zeroed pixel storage.
490 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps have six sides, everything else one
499 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
500 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
501 DPSOFTRAST_Texture *texture;
// NOTE(review): this tests the product, so two negative dimensions pass, and the product can overflow int
502 if (width*height*depth < 1)
504 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
507 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
514 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
515 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
516 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
518 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
519 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
521 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
529 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are not supported
536 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
538 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
541 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
543 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check directly above
546 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
548 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
551 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
553 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero for any positive x that is not a power of two
556 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
561 // find first empty slot in texture array
562 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
563 if (!dpsoftrast.texture[texnum].bytes)
565 dpsoftrast.texture_firstfree = texnum + 1;
// grow the array when the chosen slot is beyond its capacity
566 if (dpsoftrast.texture_max <= texnum)
567 DPSOFTRAST_Texture_Grow();
568 if (dpsoftrast.texture_end <= texnum)
569 dpsoftrast.texture_end = texnum + 1;
570 texture = &dpsoftrast.texture[texnum];
571 memset(texture, 0, sizeof(*texture));
572 texture->flags = flags;
573 texture->width = width;
574 texture->height = height;
575 texture->depth = depth;
576 texture->sides = sides;
// per-level table entry: [0] byte offset, [1] byte size (4 bytes per texel, all sides), [2..4] dimensions
588 s = w * h * d * sides * 4;
589 texture->mipmap[mipmaps][0] = size;
590 texture->mipmap[mipmaps][1] = s;
591 texture->mipmap[mipmaps][2] = w;
592 texture->mipmap[mipmaps][3] = h;
593 texture->mipmap[mipmaps][4] = d;
// condition on reaching a 1x1x1 level or on the texture not being mipmapped
596 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
602 texture->mipmaps = mipmaps;
603 texture->size = size;
605 // allocate the pixels now
// NOTE(review): no check of the allocation result is visible in this excerpt
606 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// free / used bookkeeping of the texture array.  Invalid handles are ignored.
610 void DPSOFTRAST_Texture_Free(int index)
612 DPSOFTRAST_Texture *texture;
613 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
617 MM_FREE(texture->bytes);
618 texture->bytes = NULL;
// zeroing the whole record also leaves bytes == NULL, which marks the slot as free
619 memset(texture, 0, sizeof(*texture));
620 // adjust the free range and used range
621 if (dpsoftrast.texture_firstfree > index)
622 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing free slots
623 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
624 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture by averaging texels of
// the previous level.  Texels are 4 bytes; each output row is built from up
// to two source rows in up to two source layers.
626 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
628 int i, x, y, z, w, layer0, layer1, row0, row1;
629 unsigned char *o, *i0, *i1, *i2, *i3;
630 DPSOFTRAST_Texture *texture;
631 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// condition: the texture has no levels beyond the base
632 if (texture->mipmaps <= 1)
634 for (i = 1;i < texture->mipmaps;i++)
636 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
640 if (layer1 >= texture->mipmap[i-1][4])
641 layer1 = texture->mipmap[i-1][4]-1;
642 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
646 if (row1 >= texture->mipmap[i-1][3])
647 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: source rows in level i-1 (layer0/layer1 x row0/row1)
648 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
649 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
650 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
651 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
652 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
653 w = texture->mipmap[i][2];
656 if (texture->mipmap[i-1][2] > 1)
658 // average 3D texture
// eight source texels per output texel, rounded (+4, >>3)
659 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
661 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
662 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
663 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
664 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
669 // average 3D mipmap with parent width == 1
// four source texels per output texel, rounded (+2, >>2)
670 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
672 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
673 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
674 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
675 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
681 if (texture->mipmap[i-1][2] > 1)
683 // average 2D texture (common case)
// 2x2 box filter, rounded (+2, >>2)
684 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
686 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
687 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
688 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
689 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
694 // 2D texture with parent width == 1
// two source texels, rounded (+1, >>1)
695 o[0] = (i0[0] + i1[0] + 1) >> 1;
696 o[1] = (i0[1] + i1[1] + 1) >> 1;
697 o[2] = (i0[2] + i1[2] + 1) >> 1;
698 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then rebuilds
// the mipmaps.  Invalid handles are ignored.
// NOTE(review): the visible lines always address level 0 (mipmap[0][2] is used as the row width); `mip`
// is not used in them, and no bounds check on the block rectangle is visible.
705 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
707 DPSOFTRAST_Texture *texture;
709 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel of the block
714 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
715 while (blockheight > 0)
// one row per iteration: source advances by the block width, destination by the texture width
717 memcpy(dst, pixels, blockwidth * 4);
718 pixels += blockwidth * 4;
719 dst += texture->mipmap[0][2] * 4;
723 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level with `pixels` and rebuilds the mipmaps.
// Invalid handles are ignored.
725 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
727 DPSOFTRAST_Texture *texture;
728 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// mipmap[0][1] is the byte size of level 0, so `pixels` must provide at least that many bytes
732 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
733 DPSOFTRAST_Texture_CalculateMipmaps(index);
735 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
737 DPSOFTRAST_Texture *texture;
738 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
739 return texture->mipmap[mip][2];
741 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
743 DPSOFTRAST_Texture *texture;
744 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
745 return texture->mipmap[mip][3];
747 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
749 DPSOFTRAST_Texture *texture;
750 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
751 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the
// texture's pixel storage, or 0 for an invalid handle.
// NOTE(review): `mip` is not range-checked in the visible lines.
753 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
755 DPSOFTRAST_Texture *texture;
756 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset
759 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture.  Invalid handles are ignored.
761 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
763 DPSOFTRAST_Texture *texture;
764 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// filters above DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported as an error on textures created without the MIPMAP flag
765 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
767 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
772 texture->filter = filter;
775 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current end of the command pool (commandpool.freecommand)
// as dpsoftrast.drawcommand.
777 static void DPSOFTRAST_Draw_SyncCommands(void)
// with threads in use, issue the store fence before the new position is written
779 if(dpsoftrast.usethreads) MEMORY_BARRIER;
780 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make `space` bytes available in the command pool: recomputes how
// much of the pool is still in use from each thread's commandoffset and,
// while that is too much, waits on a thread's waitcond.
783 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
785 DPSOFTRAST_State_Thread *thread;
787 int freecommand = dpsoftrast.commandpool.freecommand;
788 int usedcommands = dpsoftrast.commandpool.usedcommands;
// condition: the recorded usage already leaves `space` bytes
789 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
791 DPSOFTRAST_Draw_SyncCommands();
797 for (i = 0; i < dpsoftrast.numthreads; i++)
799 thread = &dpsoftrast.threads[i];
// distance from this thread's position to the write position, modulo the pool size
800 commandoffset = freecommand - thread->commandoffset;
801 if (commandoffset < 0)
802 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// track the largest distance found
803 if (commandoffset > usedcommands)
806 usedcommands = commandoffset;
// condition: enough space now, or no thread was selected to wait on
809 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
811 thread = &dpsoftrast.threads[waitindex];
812 Thread_LockMutex(thread->drawmutex);
// only wait if that thread has not yet reached the published draw position
813 if (thread->commandoffset != dpsoftrast.drawcommand)
815 thread->waiting = true;
816 if (thread->starving) Thread_CondSignal(thread->drawcond);
817 Thread_CondWait(thread->waitcond, thread->drawmutex);
818 thread->waiting = false;
820 Thread_UnlockMutex(thread->drawmutex);
822 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE (the mask arithmetic
// requires COMMAND_SIZE to be a power of two).
825 #define DPSOFTRAST_ALIGNCOMMAND(size) \
826 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> in the pool, with its
// size padded by DPSOFTRAST_ALIGNCOMMAND, and returns it cast to that type.
827 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
828 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes for a command in the pool and writes its header
// (opcode and commandsize).  A command never straddles the end of the
// buffer: when it does not fit in the remaining tail, a Reset opcode is
// written there and the tail is counted as used.
830 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
832 DPSOFTRAST_Command *command;
833 int freecommand = dpsoftrast.commandpool.freecommand;
834 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra space needed besides the command itself: one header, plus the unusable tail if the command does not fit before the end
835 int extra = sizeof(DPSOFTRAST_Command);
836 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
837 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// pool too full: reclaim space, then re-read the pool positions
838 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
840 if (dpsoftrast.usethreads)
841 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
843 DPSOFTRAST_Draw_FlushThreads();
844 freecommand = dpsoftrast.commandpool.freecommand;
845 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit before the end of the buffer: mark the tail with Reset and count it as used
847 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
849 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
850 command->opcode = DPSOFTRAST_OPCODE_Reset;
851 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
854 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
855 command->opcode = opcode;
856 command->commandsize = size;
// condition: the write position has reached the end of the buffer
858 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
860 dpsoftrast.commandpool.freecommand = freecommand;
861 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back `size` bytes of the command pool: lowers usedcommands by size
// and stores the adjusted write position.
// NOTE(review): the lines that move freecommand back are not visible in this excerpt; the visible
// `+= DPSOFTRAST_DRAW_MAXCOMMANDPOOL` is presumably the wrap-around for a negative result - confirm.
865 static void DPSOFTRAST_UndoCommand(int size)
867 int freecommand = dpsoftrast.commandpool.freecommand;
868 int usedcommands = dpsoftrast.commandpool.usedcommands;
871 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
872 usedcommands -= size;
873 dpsoftrast.commandpool.freecommand = freecommand;
874 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): x, y, width, height.
877 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread side: stores the viewport and marks the framebuffer-derived state dirty.
878 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
880 thread->viewport[0] = command->x;
881 thread->viewport[1] = command->y;
882 thread->viewport[2] = command->width;
883 thread->viewport[3] = command->height;
884 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command and updates the global viewport and its transform.
886 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
888 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
891 command->width = width;
892 command->height = height;
894 dpsoftrast.viewport[0] = x;
895 dpsoftrast.viewport[1] = y;
896 dpsoftrast.viewport[2] = width;
897 dpsoftrast.viewport[3] = height;
898 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): the color to clear to, as four floats.
901 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: clears the part of the scissor rectangle that falls inside
// this thread's scanline ranges, in every color buffer.
902 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
904 int i, x1, y1, x2, y2, w, h, x, y;
905 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the thread's scanline ranges are current
909 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
910 miny1 = thread->miny1;
911 maxy1 = thread->maxy1;
912 miny2 = thread->miny2;
913 maxy2 = thread->maxy2;
914 x1 = thread->fb_scissor[0];
915 y1 = thread->fb_scissor[1];
916 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
917 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rectangle vertically to the rows this thread owns
918 if (y1 < miny1) y1 = miny1;
919 if (y2 > maxy2) y2 = maxy2;
924 // FIXME: honor fb_colormask?
// clear value packed as BGRA8
925 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
926 for (i = 0;i < 4;i++)
// condition: this color buffer slot is not in use
928 if (!dpsoftrast.fb_colorpixels[i])
// first band ends at maxy1; the loop then jumps to the second band (miny2..maxy2)
930 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
933 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
934 for (x = x1;x < x2;x++)
// API side: allocates the command in the pool.
939 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
941 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): the depth value to clear to.
948 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: clears the part of the scissor rectangle that falls inside
// this thread's scanline ranges, in the depth buffer.
949 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
951 int x1, y1, x2, y2, w, h, x, y;
952 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the thread's scanline ranges are current
956 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
957 miny1 = thread->miny1;
958 maxy1 = thread->maxy1;
959 miny2 = thread->miny2;
960 maxy2 = thread->maxy2;
961 x1 = thread->fb_scissor[0];
962 y1 = thread->fb_scissor[1];
963 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
964 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rectangle vertically to the rows this thread owns
965 if (y1 < miny1) y1 = miny1;
966 if (y2 > maxy2) y2 = maxy2;
// clear value in the integer depth format
971 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first band ends at maxy1; the loop then jumps to the second band (miny2..maxy2)
972 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
975 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
976 for (x = x1;x < x2;x++)
// API side: allocates the command in the pool.
980 void DPSOFTRAST_ClearDepth(float d)
982 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): one write-enable flag per channel.
986 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread side: stores each flag normalised to 0/1 and builds the matching
// per-pixel bit mask.
987 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
989 thread->colormask[0] = command->r != 0;
990 thread->colormask[1] = command->g != 0;
991 thread->colormask[2] = command->b != 0;
992 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all-ones (assuming colormask is a signed int - TODO confirm),
// which selects that channel's byte: r -> bits 16-23, g -> 8-15, b -> 0-7, a -> 24-31
993 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: allocates the command in the pool.
995 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
997 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enable flag.
1004 DEFCOMMAND(5, DepthTest, int enable;)
// Thread side: stores the flag and marks the derived depth function dirty.
1005 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1007 thread->depthtest = command->enable;
1008 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API side: queues the command.
1010 void DPSOFTRAST_DepthTest(int enable)
1012 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1013 command->enable = enable;
// ScissorTest command (opcode 6): enable flag.
1016 DEFCOMMAND(6, ScissorTest, int enable;)
// Thread side: stores the flag and marks the framebuffer-derived state dirty.
1017 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1019 thread->scissortest = command->enable;
1020 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command.
1022 void DPSOFTRAST_ScissorTest(int enable)
1024 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1025 command->enable = enable;
// Scissor command (opcode 7): rectangle as x, y, width, height (floats).
1028 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Thread side: stores the rectangle and marks the framebuffer-derived state dirty.
1029 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1031 thread->scissor[0] = command->x;
1032 thread->scissor[1] = command->y;
1033 thread->scissor[2] = command->width;
1034 thread->scissor[3] = command->height;
1035 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command.
1037 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1039 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1042 command->width = width;
1043 command->height = height;
// BlendFunc command (opcode 8): source and destination blend factors.
1046 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Thread side: stores both factors and marks the derived blend mode dirty.
1047 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1049 thread->blendfunc[0] = command->sfactor;
1050 thread->blendfunc[1] = command->dfactor;
1051 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// API side: queues the command.
1053 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1055 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1056 command->sfactor = sfactor;
1057 command->dfactor = dfactor;
// BlendSubtract command (opcode 9): enable flag for subtractive blending.
1060 DEFCOMMAND(9, BlendSubtract, int enable;)
// Thread side: stores the flag and marks the derived blend mode dirty.
1061 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1063 thread->blendsubtract = command->enable;
1064 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// API side: queues the command.
1066 void DPSOFTRAST_BlendSubtract(int enable)
1068 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1069 command->enable = enable;
// DepthMask command (opcode 10): enable flag.
1072 DEFCOMMAND(10, DepthMask, int enable;)
// Thread side: stores the flag; no derived state is invalidated.
1073 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1075 thread->depthmask = command->enable;
// API side: queues the command.
1077 void DPSOFTRAST_DepthMask(int enable)
1079 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1080 command->enable = enable;
1083 DEFCOMMAND(11, DepthFunc, int func;)
1084 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1086 thread->depthfunc = command->func;
// API side of the DepthFunc command: queues the comparison function.
1088 void DPSOFTRAST_DepthFunc(int func)
1090 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1091 command->func = func;
// DepthRange command payload: near and far values.
1094 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: near goes to thread->depthrange[0], far to [1].
1095 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1097 thread->depthrange[0] = command->nearval;
1098 thread->depthrange[1] = command->farval;
// Public depth-range setter: allocates a DepthRange command carrying both values.
1100 void DPSOFTRAST_DepthRange(float nearval, float farval)
1102 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1103 command->nearval = nearval;
1104 command->farval = farval;
// PolygonOffset command payload: the two offset terms.
1107 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: alongnormal goes to thread->polygonoffset[0],
// intoview to [1].
1108 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1110 thread->polygonoffset[0] = command->alongnormal;
1111 thread->polygonoffset[1] = command->intoview;
// Public polygon-offset setter: allocates a PolygonOffset command carrying both terms.
1113 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1115 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1116 command->alongnormal = alongnormal;
1117 command->intoview = intoview;
// CullFace command payload: the culling mode.
1120 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command by storing the mode in thread->cullface.
1121 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1123 thread->cullface = command->mode;
// Public cull-face setter: allocates a CullFace command carrying mode.
1125 void DPSOFTRAST_CullFace(int mode)
1127 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1128 command->mode = mode;
// AlphaTest command payload: a single enable flag.
1131 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command by storing the flag in thread->alphatest.
1132 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1134 thread->alphatest = command->enable;
// Public alpha-test setter: allocates an AlphaTest command carrying the enable flag.
1136 void DPSOFTRAST_AlphaTest(int enable)
1138 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1139 command->enable = enable;
// AlphaFunc command payload: comparison selector and reference value.
1142 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: func goes to thread->alphafunc, ref to
// thread->alphavalue.
1143 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1145 thread->alphafunc = command->func;
1146 thread->alphavalue = command->ref;
// Public alpha-function setter: allocates an AlphaFunc command and stores func.
// NOTE(review): the store of `ref` into command->ref is not visible in this
// listing — confirm it is present, since the interpreter reads command->ref.
1148 void DPSOFTRAST_AlphaFunc(int func, float ref)
1150 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1151 command->func = func;
// Stores an RGBA color directly in the global dpsoftrast.color[] array.
// Unlike the setters around it, no command is allocated here.
1155 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1157 dpsoftrast.color[0] = r;
1158 dpsoftrast.color[1] = g;
1159 dpsoftrast.color[2] = b;
1160 dpsoftrast.color[3] = a;
// Reads a rectangular block of pixels from the first color buffer
// (dpsoftrast.fb_colorpixels[0]) into outpixels.
//   blockx, blocky            - origin of the requested block
//   blockwidth, blockheight   - size of the requested block in pixels
//   outpixels                 - destination, addressed with a stride of blockwidth*4 bytes
// The requested rectangle is clamped to the framebuffer before copying.
1163 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// both strides are in bytes, 4 bytes per pixel
1165 int outstride = blockwidth * 4;
1166 int instride = dpsoftrast.fb_width * 4;
1169 int bx2 = blockx + blockwidth;
1170 int by2 = blocky + blockheight;
1174 unsigned char *inpixels;
// clamp the block to [0, fb_width) x [0, fb_height)
1178 if (bx1 < 0) bx1 = 0;
1179 if (by1 < 0) by1 = 0;
1180 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1181 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1183 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk the row pixel by pixel
// NOTE(review): the per-pixel statements are not visible in this listing
1184 if (dpsoftrast.bigendian)
1186 for (y = by1;y < by2;y++)
// source rows are addressed as (fb_height - 1 - y), i.e. the image is read vertically flipped;
// output rows are numbered from the clamped by1, not from blocky
1188 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189 o = (unsigned char *)outpixels + (y - by1) * outstride;
1190 for (x = bx1;x < bx2;x++)
// second row loop with identical row addressing
// NOTE(review): presumably the non-big-endian branch; its copy statement is not visible here
1203 for (y = by1;y < by2;y++)
1205 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1206 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the first color buffer into one mip level of a texture.
//   index, mip     - texture handle and mip level to write
//   tx, ty         - destination origin inside the texture
//   sx, sy         - source origin inside the framebuffer
//   width, height  - requested rectangle size
// Returns without copying when the texture lookup fails or mip is out of range.
1212 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1216 int tx2 = tx + width;
1217 int ty2 = ty + height;
1220 int sx2 = sx + width;
1221 int sy2 = sy + height;
1231 unsigned int *spixels;
1232 unsigned int *tpixels;
1233 DPSOFTRAST_Texture *texture;
1234 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1235 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the framebuffer; destination is the mip level's pixels inside texture->bytes
1237 spixels = dpsoftrast.fb_colorpixels[0];
1238 swidth = dpsoftrast.fb_width;
1239 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset, [2] and [3] as the level's width and height
1240 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1241 twidth = texture->mipmap[mip][2];
1242 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the mip level and the source rectangle to the framebuffer
1243 if (tx1 < 0) tx1 = 0;
1244 if (ty1 < 0) ty1 = 0;
1245 if (tx2 > twidth) tx2 = twidth;
1246 if (ty2 > theight) ty2 = theight;
1247 if (sx1 < 0) sx1 = 0;
1248 if (sy1 < 0) sy1 = 0;
1249 if (sx2 > swidth) sx2 = swidth;
1250 if (sy2 > sheight) sy2 = sheight;
// copy no more than the smaller of the two clamped rectangles
// NOTE(review): the computations of tw/th/sw/sh are not visible in this listing
1255 if (tw > sw) tw = sw;
1256 if (th > sh) th = sh;
// NOTE(review): presumably followed by an early return for an empty rectangle — confirm
1257 if (tw < 1 || th < 1)
// source rows are walked downward from (sheight - 1 - sy1) while destination rows
// go upward from ty1, so the copy flips the image vertically
1259 sy1 = sheight - 1 - sy1;
1260 for (y = 0;y < th;y++)
1261 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// textures with more than one mip level get their mip chain recalculated
1262 if (texture->mipmaps > 1)
1263 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command payload: texture unit number and the texture to bind (may be NULL).
1266 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread: atomically decrements the bind
// count of the texture currently bound to the unit (if any), then records the
// new texture pointer for that unit.
1267 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1269 if (thread->texbound[command->unitnum])
1270 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1271 thread->texbound[command->unitnum] = command->texture;
1273 void DPSOFTRAST_SetTexture(int unitnum, int index)
1275 DPSOFTRAST_Command_SetTexture *command;
1276 DPSOFTRAST_Texture *texture;
1277 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1279 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1282 texture = DPSOFTRAST_Texture_GetByIndex(index);
1283 if (index && !texture)
1285 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1289 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1290 command->unitnum = unitnum;
1291 command->texture = texture;
1293 dpsoftrast.texbound[unitnum] = texture;
1294 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its stride in the global state.
// The stride is later multiplied by a vertex index and added to a byte pointer
// (see DPSOFTRAST_Array_Load), so it is a byte stride.
1297 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1299 dpsoftrast.pointer_vertex3f = vertex3f;
1300 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array: stores the pointer and stride and clears
// the unsigned-byte color pointer so only one color source is active.
1302 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1304 dpsoftrast.pointer_color4f = color4f;
1305 dpsoftrast.pointer_color4ub = NULL;
1306 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array: stores the pointer and stride and
// clears the float color pointer so only one color source is active.
1308 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1310 dpsoftrast.pointer_color4f = NULL;
1311 dpsoftrast.pointer_color4ub = color4ub;
1312 dpsoftrast.stride_color = stride;
// Records the texture-coordinate array for one texcoord unit: pointer,
// component count per vertex, and stride.
// NOTE(review): unitnum is used as an array index without a range check here —
// callers are presumably expected to pass a valid unit; confirm.
1314 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1316 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1317 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1318 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command payload: shader mode, permutation, and the exact-specular-math flag.
1321 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command by copying all three values into the thread state.
1322 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1324 thread->shader_mode = command->mode;
1325 thread->shader_permutation = command->permutation;
1326 thread->shader_exactspecularmath = command->exactspecularmath;
// Public shader selector: allocates a SetShader command carrying the three
// values and also mirrors them in the global dpsoftrast state.
1328 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1330 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1331 command->mode = mode;
1332 command->permutation = permutation;
1333 command->exactspecularmath = exactspecularmath;
// keep the global copy in step with what was just put in the command
1335 dpsoftrast.shader_mode = mode;
1336 dpsoftrast.shader_permutation = permutation;
1337 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command payload: uniform index and a 4-float value.
1340 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into the thread's
// uniform4f array at index*4 (each uniform occupies four consecutive floats).
1341 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1343 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a 4-component uniform from four scalars: allocates a Uniform4f command
// carrying the values and mirrors them in the global uniform4f array at index*4.
1345 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1347 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1348 command->index = index;
1349 command->val[0] = v0;
1350 command->val[1] = v1;
1351 command->val[2] = v2;
1352 command->val[3] = v3;
// global copy of the same value
1354 dpsoftrast.uniform4f[index*4+0] = v0;
1355 dpsoftrast.uniform4f[index*4+1] = v1;
1356 dpsoftrast.uniform4f[index*4+2] = v2;
1357 dpsoftrast.uniform4f[index*4+3] = v3;
// Sets a 4-component uniform from an array: v must point at four readable
// floats. The values are copied into a new Uniform4f command and into the
// global uniform4f array at index*4.
1359 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1361 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1362 command->index = index;
1363 memcpy(command->val, v, sizeof(command->val));
1365 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command payload: uniform index and 16 floats, declared
// through ALIGN so the array can be the target of aligned SSE stores.
1368 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into the thread's
// uniform4f array starting at index*4, i.e. four consecutive 4-float slots.
1369 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1371 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets one or more 4x4 matrix uniforms.
//   uniform   - first uniform slot to write
//   arraysize - number of matrices to read from v
//   transpose - NOTE(review): the statement that tests this flag is not visible
//               in this listing; presumably it guards the transpose sequence below
//   v         - arraysize * 16 floats; may be unaligned
// Each matrix is stored both in a freshly allocated UniformMatrix4f command and
// in the global uniform4f array.
1373 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix takes four uniform slots (index += 4) and sixteen input floats (v += 16)
1377 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1379 __m128 m0, m1, m2, m3;
1380 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1381 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the address of v
1382 if (((size_t)v)&(ALIGN_SIZE-1))
1384 m0 = _mm_loadu_ps(v);
1385 m1 = _mm_loadu_ps(v+4);
1386 m2 = _mm_loadu_ps(v+8);
1387 m3 = _mm_loadu_ps(v+12);
1391 m0 = _mm_load_ps(v);
1392 m1 = _mm_load_ps(v+4);
1393 m2 = _mm_load_ps(v+8);
1394 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave pairs of vectors, then recombine the low/high halves
// so that afterwards m0..m3 each hold one lane of the original four vectors
1398 __m128 t0, t1, t2, t3;
1399 t0 = _mm_unpacklo_ps(m0, m1);
1400 t1 = _mm_unpacklo_ps(m2, m3);
1401 t2 = _mm_unpackhi_ps(m0, m1);
1402 t3 = _mm_unpackhi_ps(m2, m3);
1403 m0 = _mm_movelh_ps(t0, t1);
1404 m1 = _mm_movehl_ps(t1, t0);
1405 m2 = _mm_movelh_ps(t2, t3);
1406 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: command->val is declared with ALIGN
1408 _mm_store_ps(command->val, m0);
1409 _mm_store_ps(command->val+4, m1);
1410 _mm_store_ps(command->val+8, m2);
1411 _mm_store_ps(command->val+12, m3);
// global copy; aligned stores, so dpsoftrast.uniform4f must be 16-byte aligned
1412 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1413 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1414 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1415 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command payload: uniform index and an integer value.
1420 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command by storing the value in the thread's uniform1i array.
1421 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1423 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: allocates a Uniform1i command for the given index
// and mirrors the value in the global uniform1i array.
// NOTE(review): the store of i0 into command->val is not visible in this
// listing — confirm it is present, since the interpreter reads command->val.
1425 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1427 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1428 command->index = index;
1431 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float elements from a strided byte source into a packed
// float array.
//   dst    - destination; written with aligned stores, so it must be 16-byte aligned
//   src    - first source element
//   size   - number of elements
//   stride - distance between source elements (it is OR-ed with the source
//            address for the alignment test, so both must be multiples of
//            ALIGN_SIZE for the aligned path)
// NOTE(review): the loop control and pointer advances are not visible in this listing.
1435 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1437 float *end = dst + size*4;
1438 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source: unaligned load, aligned store
1442 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// aligned source and stride: aligned load, aligned store
1451 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float elements into packed 4-float elements whose fourth
// component is 1.0f.
//   dst    - destination; written with aligned stores
//   src    - first source element
//   size   - number of elements
//   stride - distance between source elements in bytes
// Every path uses the same lane trick to produce (x, y, z, 1): rotate the
// vector so the unwanted lane sits in lane 0, overwrite lane 0 with 1.0f via
// _mm_move_ss, then rotate back with _MM_SHUFFLE(0, 3, 2, 1).
// NOTE(review): loop control and the dst advances are not visible in this listing.
1458 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1460 float *end = dst + size*4;
// tightly packed input: process four vertices (12 floats = three vectors) per step
1461 if (stride == sizeof(float[3]))
// end4 marks the end of the part of dst that is a whole multiple of four vertices
1463 float *end4 = dst + (size&~3)*4;
1464 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned variant: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1468 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// vertex 0 comes entirely from v1
1469 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1470 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 1: x from the top lane of v1, y and z from the low lanes of v2
1472 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1473 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 2: x and y from the high lanes of v2, z from the low lane of v3
1475 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1476 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1477 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 3 already sits in lanes 1..3 of v3; only lane 0 needs replacing
1479 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1480 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1482 src += 4*sizeof(float[3]);
// aligned variant: identical shuffles, aligned loads
1489 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1490 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1491 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1492 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1493 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1494 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1496 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1497 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1498 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1501 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1503 src += 4*sizeof(float[3]);
// one vertex at a time; note that each load reads four floats (16 bytes) from
// a three-float element, so one float beyond the element is read and discarded
1507 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1511 __m128 v = _mm_loadu_ps((const float *)src);
1512 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515 _mm_store_ps(dst, v);
1524 __m128 v = _mm_load_ps((const float *)src);
1525 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1526 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1527 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1528 _mm_store_ps(dst, v);
// Expands `size` 2-float elements into packed 4-float elements of the form
// (a, b, 0, 1).
//   dst    - destination; written with aligned stores
//   src    - first source element
//   size   - number of elements
//   stride - distance between source elements in bytes
// NOTE(review): loop control and the dst advances are not visible in this listing.
1535 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1537 float *end = dst + size*4;
// constant supplying the (0, 1) upper half of every output element
1538 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// tightly packed input: one 16-byte load yields two elements
1539 if (stride == sizeof(float[2]))
// end2 marks the end of the part of dst that is a whole multiple of two elements
1541 float *end2 = dst + (size&~1)*4;
1542 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1546 __m128 v = _mm_loadu_ps((const float *)src);
// low half of v + high half of v2 -> first element
1547 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// high half of v + high half of v2 -> second element
1548 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1550 src += 2*sizeof(float[2]);
// aligned variant of the same pairwise expansion
1557 __m128 v = _mm_load_ps((const float *)src);
1558 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1559 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1561 src += 2*sizeof(float[2]);
// single element: load exactly two floats into the low half of v2
1567 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte elements into packed 4-float elements, scaling each
// byte by 1/255 so that 255 maps to 1.0f.
//   dst    - destination; written with aligned stores
//   src    - first source element
//   size   - number of elements
//   stride - distance between source elements in bytes
// NOTE(review): loop control and the dst advances are not visible in this listing.
1573 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1575 float *end = dst + size*4;
1576 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// tightly packed input: one 16-byte load yields four elements
1577 if (stride == sizeof(unsigned char[4]))
// end4 marks the end of the part of dst that is a whole multiple of four elements
1579 float *end4 = dst + (size&~3)*4;
1580 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes -> 16-bit (v1 = low 8 bytes, v2 = high 8 bytes) by interleaving with zero
1584 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
// widen 16-bit -> 32-bit, convert to float, scale
1585 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1590 src += 4*sizeof(unsigned char[4]);
// aligned variant of the same four-element conversion
1597 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1598 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1599 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1600 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1601 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1603 src += 4*sizeof(unsigned char[4]);
// single element: the four bytes are read through an int pointer
1609 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1610 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills a packed 4-float array with one constant element.
//   dst  - destination; written with aligned stores
//   src  - the four floats to replicate (read once, unaligned load)
//   size - number of elements
// NOTE(review): the loop that advances dst up to `end` is not visible in this listing.
1616 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1618 float *end = dst + 4*size;
1619 __m128 v = _mm_loadu_ps(src);
1622 _mm_store_ps(dst, v);
// Transforms numitems 4-float vectors by a 4x4 matrix.
//   out4f       - destination (may equal in4f for in-place use)
//   in4f        - source vectors
//   numitems    - number of vectors
//   inmatrix16f - 16 floats, read as four consecutive 4-float groups m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
// NOTE(review): loop control, the store of each result and the pointer
// advances are not visible in this listing.
1628 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1631 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1632 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1634 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1636 // fast case for identity matrix
1637 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1640 end = out4f + numitems*4;
// the matrix is always loaded unaligned
1641 m0 = _mm_loadu_ps(inmatrix16f);
1642 m1 = _mm_loadu_ps(inmatrix16f + 4);
1643 m2 = _mm_loadu_ps(inmatrix16f + 8);
1644 m3 = _mm_loadu_ps(inmatrix16f + 12);
1645 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path: broadcast each lane of v and scale the matching matrix group
1649 __m128 v = _mm_loadu_ps(in4f);
1651 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1654 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input path: same arithmetic with an aligned load
1663 __m128 v = _mm_load_ps(in4f);
1665 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1666 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1667 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1668 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1676 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1678 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides a 4-float vertex `in` and maps it through the viewport.
// The vector is rotated to (1, x, y, z) so that lane 0 holds the constant 1,
// then computed as viewportcenter + viewportscale * p / w and rotated back.
// The result `out` is therefore (x', y', z', c0 + s0/w), where c0/s0 are lane 0
// of viewportcenter/viewportscale and lanes 1..3 of those vectors apply to x, y, z.
1682 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1684 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1685 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1686 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1687 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Perspective-divides a 4-float vertex `in` and maps it through the viewport:
// rotate to (1, x, y, z), compute viewportcenter + viewportscale * p / w,
// rotate back into `out`.
// NOTE(review): the visible body is line-for-line the same as
// DPSOFTRAST_PROJECTVERTEX; despite the name, all lanes are projected, not only y.
1690 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1692 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1693 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1694 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1695 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a 4-float vertex by a matrix given as four vectors:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
// NOTE(review): the declaration that copies `in` into the local `p` is not
// visible in this listing; presumably `__m128 p = (in);` — confirm.
1698 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1701 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1702 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1704 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the range of screen rows covered by an axis-aligned bounding box
// after transformation by inmatrix16f.
//   starty, endy     - outputs: first row and one-past-last row
//   minposf, maxposf - box corners; read with aligned loads (16-byte aligned)
//   inmatrix16f      - 16-float matrix, read unaligned
// clipmask starts with one bit set per box corner (0xFF); a corner's bit is
// cleared when that corner passes the z + w >= 0 test.
// NOTE(review): the return statement is not visible in this listing;
// presumably clipmask is returned — confirm.
1707 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1709 int clipmask = 0xFF;
1710 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// min/max accumulators start inverted (2 / -2) so the first accepted value replaces them
1711 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1712 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1713 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix vector so the transformed y ends up in
// lane 0, where the scalar (_ss) min/max/div operations below work
1714 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1715 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1716 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1717 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its clip distance (lane 2 + lane 3,
// i.e. z + w) and, when that is >= 0, clear the corner's clipmask bit and fold
// its y/w into minproj/maxproj. The invocations below build the corners by
// mixing lanes of minpos and maxpos.
1718 #define BBFRONT(k, pos) \
1720 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1721 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1722 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1725 clipmask &= ~(1<<k); \
1726 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1727 minproj = _mm_min_ss(minproj, proj); \
1728 maxproj = _mm_max_ss(maxproj, proj); \
1732 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1733 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1734 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1735 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1736 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1737 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k): for a corner that is still clipped, look at its three neighbours
// along the box edges (k^1, k^2, k^4). For each neighbour that is not clipped,
// interpolate along the edge to the point where the clip distance reaches zero
// (frac = d_k / (d_k - d_neighbour)) and fold that point's y/w into minproj/maxproj.
1741 if (clipmask&(1<<k)) \
1743 if (!(clipmask&(1<<(k^1)))) \
1745 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1746 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1747 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748 minproj = _mm_min_ss(minproj, proj); \
1749 maxproj = _mm_max_ss(maxproj, proj); \
1751 if (!(clipmask&(1<<(k^2)))) \
1753 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1754 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1755 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1756 minproj = _mm_min_ss(minproj, proj); \
1757 maxproj = _mm_max_ss(maxproj, proj); \
1759 if (!(clipmask&(1<<(k^4)))) \
1761 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1762 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1763 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1764 minproj = _mm_min_ss(minproj, proj); \
1765 maxproj = _mm_max_ss(maxproj, proj); \
1769 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring element 2 of the viewport vectors into lane 0; that is the element
// DPSOFTRAST_PROJECTVERTEX applies to y
1770 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1771 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// limit the projected range to [-2, 2] before converting to rows
1772 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1773 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1774 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1775 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// the start row comes from maxproj and the end row from minproj
// NOTE(review): presumably because the y viewport scale is negative — confirm
1776 *starty = _mm_cvttss_si32(maxproj);
1777 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems already-transformed vertices to screen space.
//   out4f        - receives an unmodified copy of each input vertex (aligned stores)
//   screen4f     - receives each projected vertex (aligned stores)
//   starty, endy - outputs, filled in by DPSOFTRAST_Vertex_BoundY
//   in4f         - source vertices; may be unaligned
// While copying, the component-wise minimum and maximum of the inputs are
// tracked; that box is passed to DPSOFTRAST_Vertex_BoundY with an identity
// matrix, and its return value is returned.
// NOTE(review): loop control and pointer advances are not visible in this listing.
1781 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1783 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1784 float *end = out4f + numitems*4;
1785 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1786 __m128 minpos, maxpos;
1787 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; the bounds are seeded with the first vertex
1789 minpos = maxpos = _mm_loadu_ps(in4f);
1792 __m128 v = _mm_loadu_ps(in4f);
1793 minpos = _mm_min_ps(minpos, v);
1794 maxpos = _mm_max_ps(maxpos, v);
1795 _mm_store_ps(out4f, v);
1796 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1797 _mm_store_ps(screen4f, v);
// aligned input path: same steps with aligned loads
1805 minpos = maxpos = _mm_load_ps(in4f);
1808 __m128 v = _mm_load_ps(in4f);
1809 minpos = _mm_min_ps(minpos, v);
1810 maxpos = _mm_max_ps(maxpos, v);
1811 _mm_store_ps(out4f, v);
1812 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1813 _mm_store_ps(screen4f, v);
// spill the bounds to ALIGN-declared arrays because BoundY reads them with aligned loads
1821 ALIGN(float minposf[4]);
1822 ALIGN(float maxposf[4]);
1823 _mm_store_ps(minposf, minpos);
1824 _mm_store_ps(maxposf, maxpos);
1825 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms numitems vertices by inmatrix16f and projects them to screen space.
//   out4f        - receives each transformed vertex (aligned stores)
//   screen4f     - receives each projected vertex (aligned stores)
//   starty, endy - outputs, filled in by DPSOFTRAST_Vertex_BoundY
//   in4f         - source vertices; may be unaligned
//   inmatrix16f  - 16-float matrix, read unaligned
// The bounding box is accumulated from the untransformed inputs and handed to
// DPSOFTRAST_Vertex_BoundY together with the matrix; its return value is returned.
// NOTE(review): loop control and pointer advances are not visible in this listing.
1830 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1832 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1833 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// an identity matrix (bytewise comparison) needs no transform: use the project-only routine
1835 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1836 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1837 end = out4f + numitems*4;
1838 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1839 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1840 m0 = _mm_loadu_ps(inmatrix16f);
1841 m1 = _mm_loadu_ps(inmatrix16f + 4);
1842 m2 = _mm_loadu_ps(inmatrix16f + 8);
1843 m3 = _mm_loadu_ps(inmatrix16f + 12);
1844 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; the bounds are seeded with the first vertex
1846 minpos = maxpos = _mm_loadu_ps(in4f);
1849 __m128 v = _mm_loadu_ps(in4f);
// bounds are taken before the transform, i.e. in the input space
1850 minpos = _mm_min_ps(minpos, v);
1851 maxpos = _mm_max_ps(maxpos, v);
1852 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1853 _mm_store_ps(out4f, v);
1854 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1855 _mm_store_ps(screen4f, v);
// aligned input path: same steps with aligned loads
1863 minpos = maxpos = _mm_load_ps(in4f);
1866 __m128 v = _mm_load_ps(in4f);
1867 minpos = _mm_min_ps(minpos, v);
1868 maxpos = _mm_max_ps(maxpos, v);
1869 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1870 _mm_store_ps(out4f, v);
1871 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1872 _mm_store_ps(screen4f, v);
// spill the bounds to ALIGN-declared arrays because BoundY reads them with aligned loads
1880 ALIGN(float minposf[4]);
1881 ALIGN(float maxposf[4]);
1882 _mm_store_ps(minposf, minpos);
1883 _mm_store_ps(maxposf, maxpos);
1884 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one vertex attribute array into the 4-float working array
// dpsoftrast.post_array4f[outarray], expanding it to four floats per vertex.
//   outarray - index of the destination working array
//   inarray  - which attribute to load (position, color, or a texcoord unit)
// The range loaded is numvertices vertices starting at firstvertex.
// NOTE(review): the switch statement, its case/default labels for texcoords,
// the break statements and the return are not visible in this listing;
// presumably outf is returned — confirm.
1890 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1893 float *outf = dpsoftrast.post_array4f[outarray];
1894 const unsigned char *inb;
1895 int firstvertex = dpsoftrast.firstvertex;
1896 int numvertices = dpsoftrast.numvertices;
// positions: three floats per vertex, expanded to (x, y, z, 1)
1900 case DPSOFTRAST_ARRAY_POSITION:
1901 stride = dpsoftrast.stride_vertex;
1902 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1903 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array if set, else byte array if set, else the constant color
1905 case DPSOFTRAST_ARRAY_COLOR:
1906 stride = dpsoftrast.stride_color;
1907 if (dpsoftrast.pointer_color4f)
1909 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1910 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1912 else if (dpsoftrast.pointer_color4ub)
// stride already holds stride_color at this point; this reassignment repeats it
1914 stride = dpsoftrast.stride_color;
1915 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1916 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array bound: replicate dpsoftrast.color for every vertex
1920 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0, and the loader
// is chosen by the unit's component count
1924 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1925 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1927 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1928 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1931 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1934 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1937 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f.
// When inarray >= 0 the attribute is first loaded into post_array4f[outarray];
// a negative inarray reuses whatever post_array4f[outarray] already holds.
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned — confirm.
1949 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1951 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1952 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without transforming it.
// When inarray >= 0 the attribute is first loaded into post_array4f[outarray];
// a negative inarray reuses the array's current contents. Screen coordinates
// go to dpsoftrast.screencoord4f, the row range to drawstarty/drawendy, and
// the projection routine's return value to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned — confirm.
1957 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1960 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1961 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to screen space.
// When inarray >= 0 the attribute is first loaded into post_array4f[outarray];
// a negative inarray reuses the array's current contents. Screen coordinates
// go to dpsoftrast.screencoord4f, the row range to drawstarty/drawendy, and
// the routine's return value to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned — confirm.
1969 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1972 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1973 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes the per-pixel reciprocal of the interpolated w (1/w) for a span.
// The exact reciprocal is evaluated only at sub-span boundaries, every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, and stepped linearly in between.
//   thread   - not referenced by the visible lines
//   triangle - supplies the w plane: w[0] is the slope per x, w[1] per y, w[2] the base
//   span     - supplies the span origin (x, y) and the pixel range [startx, endx)
//   zf       - output array
// NOTE(review): the store into zf and the initialisation of z are not visible
// in this listing; presumably zf[x] = z with z starting at the previous endz.
1980 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1983 int startx = span->startx;
1984 int endx = span->endx;
1985 float wslope = triangle->w[0];
// w at the span origin
1986 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact 1/w at the first pixel
1987 float endz = 1.0f / (w + wslope * startx);
1988 for (x = startx;x < endx;)
1990 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened to end on the final pixel of the span
1992 if (nextsub >= endx) nextsub = endsub = endx-1;
1993 endz = 1.0f / (w + wslope * nextsub);
// linear step between the two exact values; zero for a one-pixel sub-span
1994 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1995 for (; x <= endsub; x++, z += dz)
2000 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2003 int startx = span->startx;
2004 int endx = span->endx;
2007 unsigned char * RESTRICT pixelmask = span->pixelmask;
2008 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2011 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2012 // handle alphatest now (this affects depth writes too)
2013 if (thread->alphatest)
2014 for (x = startx;x < endx;x++)
2015 if (in4f[x*4+3] < 0.5f)
2016 pixelmask[x] = false;
2017 // FIXME: this does not handle bigendian
2018 switch(thread->fb_blendmode)
2020 case DPSOFTRAST_BLENDMODE_OPAQUE:
2021 for (x = startx;x < endx;x++)
2025 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2026 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2027 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2028 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2029 pixel[x*4+0] = d[0];
2030 pixel[x*4+1] = d[1];
2031 pixel[x*4+2] = d[2];
2032 pixel[x*4+3] = d[3];
2035 case DPSOFTRAST_BLENDMODE_ALPHA:
2036 for (x = startx;x < endx;x++)
2040 a = in4f[x*4+3] * 255.0f;
2041 b = 1.0f - in4f[x*4+3];
2042 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2043 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2044 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2045 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2046 pixel[x*4+0] = d[0];
2047 pixel[x*4+1] = d[1];
2048 pixel[x*4+2] = d[2];
2049 pixel[x*4+3] = d[3];
2052 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2053 for (x = startx;x < endx;x++)
2057 a = in4f[x*4+3] * 255.0f;
2058 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2059 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2060 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2061 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2062 pixel[x*4+0] = d[0];
2063 pixel[x*4+1] = d[1];
2064 pixel[x*4+2] = d[2];
2065 pixel[x*4+3] = d[3];
2068 case DPSOFTRAST_BLENDMODE_ADD:
2069 for (x = startx;x < endx;x++)
2073 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2074 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2075 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2076 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2077 pixel[x*4+0] = d[0];
2078 pixel[x*4+1] = d[1];
2079 pixel[x*4+2] = d[2];
2080 pixel[x*4+3] = d[3];
2083 case DPSOFTRAST_BLENDMODE_INVMOD:
2084 for (x = startx;x < endx;x++)
2088 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2089 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2090 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2091 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2092 pixel[x*4+0] = d[0];
2093 pixel[x*4+1] = d[1];
2094 pixel[x*4+2] = d[2];
2095 pixel[x*4+3] = d[3];
2098 case DPSOFTRAST_BLENDMODE_MUL:
2099 for (x = startx;x < endx;x++)
2103 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2104 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2105 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2106 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2107 pixel[x*4+0] = d[0];
2108 pixel[x*4+1] = d[1];
2109 pixel[x*4+2] = d[2];
2110 pixel[x*4+3] = d[3];
2113 case DPSOFTRAST_BLENDMODE_MUL2:
2114 for (x = startx;x < endx;x++)
2118 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2119 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2120 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2121 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2122 pixel[x*4+0] = d[0];
2123 pixel[x*4+1] = d[1];
2124 pixel[x*4+2] = d[2];
2125 pixel[x*4+3] = d[3];
2128 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2129 for (x = startx;x < endx;x++)
2133 a = in4f[x*4+3] * -255.0f;
2134 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2135 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2136 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2137 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2138 pixel[x*4+0] = d[0];
2139 pixel[x*4+1] = d[1];
2140 pixel[x*4+2] = d[2];
2141 pixel[x*4+3] = d[3];
2144 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2145 for (x = startx;x < endx;x++)
2150 b = 1.0f - in4f[x*4+3];
2151 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2152 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2153 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2154 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2155 pixel[x*4+0] = d[0];
2156 pixel[x*4+1] = d[1];
2157 pixel[x*4+2] = d[2];
2158 pixel[x*4+3] = d[3];
2161 case DPSOFTRAST_BLENDMODE_INVADD:
2162 for (x = startx;x < endx;x++)
2166 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2167 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2168 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2169 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2170 pixel[x*4+0] = d[0];
2171 pixel[x*4+1] = d[1];
2172 pixel[x*4+2] = d[2];
2173 pixel[x*4+3] = d[3];
// Finishes a span whose shader output is packed 4-bytes-per-pixel color: blends the
// source colors in in4ub into color buffer 0 (dpsoftrast.fb_colorpixels[0]) according
// to thread->fb_blendmode.
//   thread   - provides the alphatest flag and the framebuffer blend mode
//   triangle - not referenced by any line visible in this listing
//   span     - provides the row (y), the x origin, the [startx, endx) range and the
//              per-pixel pixelmask
//   in4ub    - source colors, 4 bytes per pixel, indexed relative to the span origin;
//              byte 3 of each pixel is treated as alpha
2179 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2183 int startx = span->startx;
2184 int endx = span->endx;
// 32-bit view of the source colors, one unsigned int per pixel
2185 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2186 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte view and 32-bit view of the same color buffer; both are moved to the span origin below
2187 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2188 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2191 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2192 pixeli += span->y * dpsoftrast.fb_width + span->x;
2193 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub holds unsigned bytes, so "< 0.5f" is true only when alpha == 0.
// If a half-opacity cutoff was intended, the byte comparison would be "< 128" - confirm.
2194 if (thread->alphatest)
2195 for (x = startx;x < endx;x++)
2196 if (in4ub[x*4+3] < 0.5f)
2197 pixelmask[x] = false;
2198 // FIXME: this does not handle bigendian
2199 switch(thread->fb_blendmode)
// opaque: when four consecutive mask bytes all equal 1, four pixels are copied with a
// single unaligned 128-bit load/store (the handling of other mask patterns is on lines
// not shown in this listing)
2201 case DPSOFTRAST_BLENDMODE_OPAQUE:
2202 for (x = startx;x + 4 <= endx;)
2204 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2206 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND walks the span two pixels at a time and switches on the two mask bytes
// read as one 16-bit value. Source and destination bytes are widened to 16-bit words
// (src, dst), and the result is narrowed back with unsigned saturation
// (_mm_packus_epi16) before being stored. A trailing loop handles the last odd pixel.
// blend2/blend1 are presumably the statements applied to a two-pixel register and a
// one-pixel register respectively (their expansion sites are on lines not shown).
// ALPHA: blend = source alpha copied to every channel;
//        dst += ((src - dst) << 4) * (blend << 4) >> 16, i.e. (src - dst) * alpha / 256.
2220 case DPSOFTRAST_BLENDMODE_ALPHA:
2221 #define FINISHBLEND(blend2, blend1) \
2222 for (x = startx;x + 1 < endx;x += 2) \
2225 switch (*(const unsigned short*)&pixelmask[x]) \
2228 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2229 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2231 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2234 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2235 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2237 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2240 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2241 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2243 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2248 for(;x < endx; x++) \
2251 if (!pixelmask[x]) \
2253 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2254 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2256 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2260 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2261 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2263 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2264 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2267 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2269 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2270 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2272 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2273 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst += src (overflow is clipped by the saturating pack in FINISHBLEND)
2276 case DPSOFTRAST_BLENDMODE_ADD:
2277 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2279 case DPSOFTRAST_BLENDMODE_INVMOD:
2281 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2283 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2286 case DPSOFTRAST_BLENDMODE_MUL:
2287 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2289 case DPSOFTRAST_BLENDMODE_MUL2:
2290 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2292 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2294 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2295 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2297 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2298 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2301 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2303 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2304 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2306 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2307 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += ((255 - dst) << 4) * (src << 4) >> 16, i.e. (255 - dst) * src / 256
2310 case DPSOFTRAST_BLENDMODE_INVADD:
2312 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2314 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texunitindex along a span and writes one float
// color (4 floats, bytes scaled by 1/255) per pixel into out4f.
//   thread       - provides the bound textures (texbound[])
//   triangle     - provides the mip level per texture unit and the attribute setup
//   span         - provides the [startx, endx) pixel range
//   out4f        - destination, indexed as out4f[x*4 + channel]
//   texunitindex - which bound texture to sample
//   arrayindex   - which attribute array supplies the texture coordinates
//   zf           - per-pixel factor multiplied into the interpolated coordinates
// Texture byte 2 is written to output channel 0 and byte 0 to channel 2, so the
// first and third channels are swapped relative to the texture's byte order.
2321 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2324 int startx = span->startx;
2325 int endx = span->endx;
2330 float tc[2], endtc[2];
2332 unsigned int tci[2];
2333 unsigned int tci1[2];
2334 unsigned int tcimin[2];
2335 unsigned int tcimax[2];
2340 const unsigned char * RESTRICT pixelbase;
2341 const unsigned char * RESTRICT pixel[4];
2342 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2343 // if no texture is bound, just fill it with white
2346 for (x = startx;x < endx;x++)
2348 out4f[x*4+0] = 1.0f;
2349 out4f[x*4+1] = 1.0f;
2350 out4f[x*4+2] = 1.0f;
2351 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is used as the byte offset of the selected mip level inside texture->bytes
2355 mip = triangle->mip[texunitindex];
2356 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2357 // if this mipmap of the texture is 1 pixel, just fill it with that color
// NOTE(review): the color is read from texture->bytes[0..3], not from pixelbase, so it
// is the first texel of the whole byte array rather than of the selected mip level
// whenever mipmap[mip][0] is non-zero - confirm whether pixelbase was intended.
2358 if (texture->mipmap[mip][1] == 4)
2360 c[0] = texture->bytes[2] * (1.0f/255.0f);
2361 c[1] = texture->bytes[1] * (1.0f/255.0f);
2362 c[2] = texture->bytes[0] * (1.0f/255.0f);
2363 c[3] = texture->bytes[3] * (1.0f/255.0f);
2364 for (x = startx;x < endx;x++)
2366 out4f[x*4+0] = c[0];
2367 out4f[x*4+1] = c[1];
2368 out4f[x*4+2] = c[2];
2369 out4f[x*4+3] = c[3];
2373 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2374 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2375 flags = texture->flags;
// mipmap[mip][2] and [3] serve as the x and y coordinate scales; [2] is also the row
// width in texels (tciwidth). tcimin is set on lines not shown in this listing.
2376 tcscale[0] = texture->mipmap[mip][2];
2377 tcscale[1] = texture->mipmap[mip][3];
2378 tciwidth = texture->mipmap[mip][2];
2381 tcimax[0] = texture->mipmap[mip][2]-1;
2382 tcimax[1] = texture->mipmap[mip][3]-1;
2383 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2384 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// coordinate at the span start: interpolated attribute times zf, scaled to texels,
// shifted by half a texel
2385 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2386 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// The span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the
// exact coordinate is evaluated only at the sub-span end (nextsub), and pixels in
// between are stepped linearly in 16.16 fixed point (subtc += substep).
2387 for (x = startx;x < endx;)
2389 unsigned int subtc[2];
2390 unsigned int substep[2];
2391 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2392 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2393 if (nextsub >= endx)
// last sub-span: end exactly on the final pixel and rescale the step to its length
2395 nextsub = endsub = endx-1;
2396 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2400 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2401 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2402 substep[0] = (endtc[0] - tc[0]) * subscale;
2403 substep[1] = (endtc[1] - tc[1]) * subscale;
2404 subtc[0] = tc[0] * (1<<16);
2405 subtc[1] = tc[1] * (1<<16);
2408 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// four-tap (bilinear) sampling with coordinates clamped to [tcimin, tcimax]
2410 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// The four lerp weights sum to 0x1000*0x1000, so a weighted byte sum is at most
// 255*0x1000000 = 0xFF000000, which is the divisor used to normalize below.
// NOTE(review): subtc is 16.16 fixed point, yet frac keeps the LOW 12 bits of the
// 16-bit fraction (& 0xFFF) rather than its top 12 bits ((subtc >> 4) & 0xFFF) - confirm.
2412 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2413 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2414 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2415 tci[0] = subtc[0]>>16;
2416 tci[1] = subtc[1]>>16;
2417 tci1[0] = tci[0] + 1;
2418 tci1[1] = tci[1] + 1;
2419 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2420 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2421 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2422 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// the four taps: (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2423 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2424 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2425 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2426 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2427 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2428 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2429 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2430 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2431 out4f[x*4+0] = c[0];
2432 out4f[x*4+1] = c[1];
2433 out4f[x*4+2] = c[2];
2434 out4f[x*4+3] = c[3];
// four-tap sampling with coordinates wrapped by AND with size-1 (tciwrapmask), which
// wraps correctly only when the size is a power of two; the frac note above applies
// here as well
2439 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2441 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2442 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2443 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2444 tci[0] = subtc[0]>>16;
2445 tci[1] = subtc[1]>>16;
2446 tci1[0] = tci[0] + 1;
2447 tci1[1] = tci[1] + 1;
2448 tci[0] &= tciwrapmask[0];
2449 tci[1] &= tciwrapmask[1];
2450 tci1[0] &= tciwrapmask[0];
2451 tci1[1] &= tciwrapmask[1];
2452 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2453 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2454 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2455 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2456 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2457 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2458 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2459 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2460 out4f[x*4+0] = c[0];
2461 out4f[x*4+1] = c[1];
2462 out4f[x*4+2] = c[2];
2463 out4f[x*4+3] = c[3];
// single-tap (nearest) sampling with clamped coordinates
2467 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2469 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2471 tci[0] = subtc[0]>>16;
2472 tci[1] = subtc[1]>>16;
2473 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2474 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2475 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2476 c[0] = pixel[0][2] * (1.0f / 255.0f);
2477 c[1] = pixel[0][1] * (1.0f / 255.0f);
2478 c[2] = pixel[0][0] * (1.0f / 255.0f);
2479 c[3] = pixel[0][3] * (1.0f / 255.0f);
2480 out4f[x*4+0] = c[0];
2481 out4f[x*4+1] = c[1];
2482 out4f[x*4+2] = c[2];
2483 out4f[x*4+3] = c[3];
// single-tap (nearest) sampling with wrapped coordinates
2488 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2490 tci[0] = subtc[0]>>16;
2491 tci[1] = subtc[1]>>16;
2492 tci[0] &= tciwrapmask[0];
2493 tci[1] &= tciwrapmask[1];
2494 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2495 c[0] = pixel[0][2] * (1.0f / 255.0f);
2496 c[1] = pixel[0][1] * (1.0f / 255.0f);
2497 c[2] = pixel[0][0] * (1.0f / 255.0f);
2498 c[3] = pixel[0][3] * (1.0f / 255.0f);
2499 out4f[x*4+0] = c[0];
2500 out4f[x*4+1] = c[1];
2501 out4f[x*4+2] = c[2];
2502 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture sampler: samples the texture bound to texunitindex
// along a span and writes one packed 4-byte color per pixel into out4ub, copied or
// filtered from the texture bytes without reordering channels.
//   thread       - provides the bound textures (texbound[])
//   triangle     - provides the mip level per texture unit and the attribute setup
//   span         - provides the [startx, endx) pixel range
//   out4ub       - destination, 4 bytes per pixel (also written through outi as 32-bit)
//   texunitindex - which bound texture to sample
//   arrayindex   - which attribute array supplies the texture coordinates
//   zf           - per-pixel factor multiplied into the interpolated coordinates
2508 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2512 int startx = span->startx;
2513 int endx = span->endx;
2515 __m128 data, slope, tcscale;
2516 __m128i tcsize, tcmask, tcoffset, tcmax;
2518 __m128i subtc, substep, endsubtc;
2521 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2522 const unsigned char * RESTRICT pixelbase;
2523 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2524 // if no texture is bound, just fill it with white
2527 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2530 mip = triangle->mip[texunitindex];
2531 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2532 // if this mipmap of the texture is 1 pixel, just fill it with that color
2533 if (texture->mipmap[mip][1] == 4)
// k is the mip level's single texel, read as one 32-bit value from pixelbase
2535 unsigned int k = *((const unsigned int *)pixelbase);
2536 for (x = startx;x < endx;x++)
2540 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2541 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2542 flags = texture->flags;
// tcsize  = (mipmap[2], mipmap[3], mipmap[2], mipmap[3]), i.e. width/height twice
// tcmask  = tcsize - 1 in each 32-bit lane
// data/slope are duplicated into both halves and pre-scaled to texels
2543 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2544 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2545 tcscale = _mm_cvtepi32_ps(tcsize);
2546 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2547 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2548 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2549 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// tcoffset: every 32-bit lane holds the 16-bit pair (4, width*4). Multiplying a
// 16-bit (x, y) pair by it with _mm_madd_epi16 gives x*4 + y*width*4, the byte
// offset of texel (x, y) from pixelbase.
// tcmax: tcmask narrowed to 16-bit words, (w-1, h-1) repeated.
2550 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2551 tcmax = _mm_packs_epi32(tcmask, tcmask);
// The span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the
// exact coordinate is evaluated at the sub-span end and stepped linearly in 16.16
// fixed point in between.
2552 for (x = startx;x < endx;)
2554 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2555 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2556 if (nextsub >= endx)
2558 nextsub = endsub = endx-1;
2559 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2563 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2564 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2565 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// pack two pixels per register: low half = coordinates of pixel x, high half = pixel
// x+1; the step is doubled so that one add advances both by two pixels
2566 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2567 substep = _mm_slli_epi32(substep, 1);
// Fast path test: the integer coordinates of the first sample and of a point just
// past the sub-span end must all satisfy 0 <= coord < size-1, so that every 2x2
// footprint lies inside the texture and needs neither clamping nor wrapping.
2570 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2571 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// stride = high 16 bits of tcoffset = width*4 = bytes per texture row
2573 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2574 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2576 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// gather the integer parts (high 16-bit words) of x and y for both pixels, then turn
// them into byte offsets
2577 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2578 tci = _mm_madd_epi16(tci, tcoffset);
2579 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2580 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; "+ stride" is the row below
2581 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2582 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2583 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2584 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fracm halves every 16-bit word; paired with (difference << 1), _mm_mulhi_epi16
// yields difference * fraction / 65536. Rows are blended by the y fraction first,
// then the two columns by the x fraction.
2585 fracm = _mm_srli_epi16(subtc, 1);
2586 pix1 = _mm_add_epi16(pix1,
2587 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2588 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2589 pix3 = _mm_add_epi16(pix3,
2590 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2591 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2592 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2593 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2594 pix2 = _mm_add_epi16(pix2,
2595 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2596 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2597 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the same filter (the condition guarding it is on a line
// not shown in this listing)
2601 const unsigned char * RESTRICT ptr1;
2602 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2603 tci = _mm_madd_epi16(tci, tcoffset);
2604 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2605 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2606 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2607 fracm = _mm_srli_epi16(subtc, 1);
2608 pix1 = _mm_add_epi16(pix1,
2609 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2610 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2611 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2612 pix1 = _mm_add_epi16(pix1,
2613 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2614 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2615 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Filtered sampling with clamp-to-edge: the four taps (x,y), (x+1,y), (x,y+1),
// (x+1,y+1) are formed by adding (0,0), (1,0), (0,1), (1,1) to the integer
// coordinates, clamped to [0, tcmax], and fetched one texel at a time.
2619 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2621 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2623 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2624 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2625 tci = _mm_madd_epi16(tci, tcoffset);
2626 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2627 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2628 _mm_setzero_si128());
2629 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2630 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2631 _mm_setzero_si128());
2632 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): this is the clamp-to-edge branch, yet the taps of the second pixel
// are wrapped (AND with tcmax) while the first pixel above and the single-pixel tail
// below are clamped with min/max. This looks like a line copied from the wrap code;
// confirm, and clamp here as well if so.
2633 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2634 tci = _mm_madd_epi16(tci, tcoffset);
2635 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2636 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2637 _mm_setzero_si128());
2638 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2639 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2640 _mm_setzero_si128());
2641 fracm = _mm_srli_epi16(subtc, 1);
2642 pix1 = _mm_add_epi16(pix1,
2643 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645 pix3 = _mm_add_epi16(pix3,
2646 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2647 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2648 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2649 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2650 pix2 = _mm_add_epi16(pix2,
2651 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2652 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2653 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the clamped filter
2657 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2658 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2659 tci = _mm_madd_epi16(tci, tcoffset);
2660 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662 _mm_setzero_si128());
2663 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665 _mm_setzero_si128());
2666 fracm = _mm_srli_epi16(subtc, 1);
2667 pix1 = _mm_add_epi16(pix1,
2668 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2670 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2671 pix1 = _mm_add_epi16(pix1,
2672 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2673 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2674 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Filtered sampling with wrapping: same four taps, wrapped by AND with tcmax
// (size-1), which wraps correctly only when the size is a power of two.
2680 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2682 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2683 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2684 tci = _mm_madd_epi16(tci, tcoffset);
2685 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2686 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2687 _mm_setzero_si128());
2688 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2689 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2690 _mm_setzero_si128());
2691 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2692 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2693 tci = _mm_madd_epi16(tci, tcoffset);
2694 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2695 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2696 _mm_setzero_si128());
2697 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2698 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2699 _mm_setzero_si128());
2700 fracm = _mm_srli_epi16(subtc, 1);
2701 pix1 = _mm_add_epi16(pix1,
2702 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2703 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2704 pix3 = _mm_add_epi16(pix3,
2705 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2706 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2707 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2708 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2709 pix2 = _mm_add_epi16(pix2,
2710 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2711 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2712 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the wrapped filter
2716 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2717 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2718 tci = _mm_madd_epi16(tci, tcoffset);
2719 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2720 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2721 _mm_setzero_si128());
2722 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2723 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2724 _mm_setzero_si128());
2725 fracm = _mm_srli_epi16(subtc, 1);
2726 pix1 = _mm_add_epi16(pix1,
2727 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2729 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2730 pix1 = _mm_add_epi16(pix1,
2731 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2732 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2733 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Unfiltered (single texel) sampling, clamped to [0, tcmax]; two pixels per
// iteration, followed by a single-pixel version
2740 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2742 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2744 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2745 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2746 tci = _mm_madd_epi16(tci, tcoffset);
2747 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2748 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2752 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2753 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2754 tci = _mm_madd_epi16(tci, tcoffset);
2755 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Unfiltered sampling, wrapped by AND with tcmax; two pixels per iteration,
// followed by a single-pixel version
2761 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2763 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2764 tci = _mm_and_si128(tci, tcmax);
2765 tci = _mm_madd_epi16(tci, tcoffset);
2766 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2767 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2771 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2772 tci = _mm_and_si128(tci, tcmax);
2773 tci = _mm_madd_epi16(tci, tcoffset);
2774 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2783 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2786 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Takes a pointer to float components (vector) and returns a float.
// NOTE(review): the body is not visible in this listing; presumably this is the shadow
// map lookup its name suggests - confirm what it actually returns before relying on it.
2789 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a span of float colors by an interpolated 4-component varying.
// For every x in [startx, endx): out4f[x] = in4f[x] * ((data + slope*x) * z),
// component by component, where data/slope are set up by DPSOFTRAST_CALCATTRIB4F for
// the attribute array selected by arrayindex.
// NOTE(review): the declarations of x, c, data, slope and z, and the assignment of z,
// are on lines not shown in this listing; z presumably comes from zf[x] - confirm.
2795 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2798 int startx = span->startx;
2799 int endx = span->endx;
2804 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2805 for (x = startx;x < endx;x++)
// c = value of the varying at this pixel
2808 c[0] = (data[0] + slope[0]*x) * z;
2809 c[1] = (data[1] + slope[1]*x) * z;
2810 c[2] = (data[2] + slope[2]*x) * z;
2811 c[3] = (data[3] + slope[3]*x) * z;
2812 out4f[x*4+0] = in4f[x*4+0] * c[0];
2813 out4f[x*4+1] = in4f[x*4+1] * c[1];
2814 out4f[x*4+2] = in4f[x*4+2] * c[2];
2815 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated 4-component varying into a span of float colors.
// For every x in [startx, endx): out4f[x] = (data + slope*x) * z, component by
// component, where data/slope are set up by DPSOFTRAST_CALCATTRIB4F for the attribute
// array selected by arrayindex.
// NOTE(review): the declarations of x, c, data, slope and z, and the assignment of z,
// are on lines not shown in this listing; z presumably comes from zf[x] - confirm.
2819 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2822 int startx = span->startx;
2823 int endx = span->endx;
2828 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2829 for (x = startx;x < endx;x++)
// c = value of the varying at this pixel
2832 c[0] = (data[0] + slope[0]*x) * z;
2833 c[1] = (data[1] + slope[1]*x) * z;
2834 c[2] = (data[2] + slope[2]*x) * z;
2835 c[3] = (data[3] + slope[3]*x) * z;
2836 out4f[x*4+0] = c[0];
2837 out4f[x*4+1] = c[1];
2838 out4f[x*4+2] = c[2];
2839 out4f[x*4+3] = c[3];
// Float span op: out4f = ina4f + max(inb4f - subcolor, 0), per component,
// for x in [span->startx, span->endx). `triangle` is not used on the lines shown.
2843 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2845 int x, startx = span->startx, endx = span->endx;
2846 float c[4], localcolor[4];
// local copy of the 4-component subtract colour
2847 localcolor[0] = subcolor[0];
2848 localcolor[1] = subcolor[1];
2849 localcolor[2] = subcolor[2];
2850 localcolor[3] = subcolor[3];
2851 for (x = startx;x < endx;x++)
// subtract the threshold colour from the second input, clamping negatives to zero
2853 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2854 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2855 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2856 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
// add what is left onto the first input
2857 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2858 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2859 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2860 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Float span op: out4f = ina4f * inb4f, per component, for x in [span->startx, span->endx).
// Buffers hold 4 floats per pixel; `triangle` is not used on the lines shown.
2864 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2866 int x, startx = span->startx, endx = span->endx;
2867 for (x = startx;x < endx;x++)
2869 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2870 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2871 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2872 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Float span op: out4f = ina4f + inb4f, per component, for x in [span->startx, span->endx).
// No clamping is applied to the sum; `triangle` is not used on the lines shown.
2876 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2878 int x, startx = span->startx, endx = span->endx;
2879 for (x = startx;x < endx;x++)
2881 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2882 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2883 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2884 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span op: out4f = ina4f * a + inb4f * b, per component, for x in [span->startx, span->endx),
// where a = 1 - (alpha of inb4f at that pixel).
// NOTE(review): b is assigned on a line not shown in this excerpt; presumably b = inb4f[x*4+3],
// which would make this an alpha blend of inb over ina - confirm.
2888 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2890 int x, startx = span->startx, endx = span->endx;
2892 for (x = startx;x < endx;x++)
// weight of the first buffer: inverse of the second buffer's 4th component
2894 a = 1.0f - inb4f[x*4+3];
2896 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2897 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2898 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2899 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Float span op: blends a constant colour over in4f using the colour's own 4th component as weight:
//   out4f = in4f * (1 - color[3]) + color * color[3]
// for x in [span->startx, span->endx). `triangle` is not used on the lines shown.
2903 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2905 int x, startx = span->startx, endx = span->endx;
2906 float localcolor[4], ilerp, lerp;
// local copy of the uniform colour
2907 localcolor[0] = color[0];
2908 localcolor[1] = color[1];
2909 localcolor[2] = color[2];
2910 localcolor[3] = color[3];
// blend weights are loop-invariant: lerp for the colour, ilerp for the input
2911 ilerp = 1.0f - localcolor[3];
2912 lerp = localcolor[3];
2913 for (x = startx;x < endx;x++)
2915 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2916 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2917 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2918 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// 8-bit span op (SSE2): out4ub = in4ub * interpolated attribute, 4 bytes per pixel.
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at subspan boundaries;
// between boundaries it is stepped linearly in fixed point (scale 256).
// NOTE(review): the assignments of `mod` and `submod` at the top of each subspan and the
// guard/increment around the single-pixel tail are on lines not shown in this excerpt.
2924 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2928 int startx = span->startx;
2929 int endx = span->endx;
2932 __m128i submod, substep, endsubmod;
2933 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute is in the same channel order as the BGRA bytes
2934 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2935 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as 32-bit fixed point (x256)
2936 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2937 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2938 for (x = startx; x < endx;)
// default: a full-length subspan, with the per-pixel step scale to match
2940 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2941 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2942 if (nextsub >= endx)
// final (short) subspan: end on the last pixel and rescale the step to the real length
2944 nextsub = endsub = endx-1;
2945 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the subspan end, and the fixed-point step per pixel from `mod` to it
2949 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2950 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2951 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = modulator for pixel x, high half = modulator for pixel x+1
2952 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2953 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod is advanced by one substep per two-pixel iteration here;
// confirm whether a step of 2*substep was intended.
2954 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes go into the high byte of each 16-bit lane, so mulhi yields (byte * modulator) >> 8
2956 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2957 pix = _mm_mulhi_epu16(pix, submod);
2958 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (32-bit load/store)
2962 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2963 pix = _mm_mulhi_epu16(pix, submod);
2964 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span op (SSE2): writes the interpolated attribute itself to out4ub, 4 bytes per pixel.
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at subspan boundaries;
// between boundaries it is stepped linearly in fixed point (scale 4095, shifted right by 4
// before the saturating pack to bytes).
// NOTE(review): the assignments of `mod` and `submod` at the top of each subspan and the
// guard/increment around the single-pixel tail are on lines not shown in this excerpt.
2971 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2975 int startx = span->startx;
2976 int endx = span->endx;
2979 __m128i submod, substep, endsubmod;
2980 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute is in the same channel order as the BGRA bytes
2981 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2982 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as 32-bit fixed point (x4095)
2983 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2984 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2985 for (x = startx; x < endx;)
// default: a full-length subspan, with the per-pixel step scale to match
2987 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2988 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2989 if (nextsub >= endx)
// final (short) subspan: end on the last pixel and rescale the step to the real length
2991 nextsub = endsub = endx-1;
2992 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the subspan end, and the fixed-point step per pixel from `mod` to it
2996 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2997 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2998 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
2999 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3000 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod is advanced by one substep per two-pixel iteration here;
// confirm whether a step of 2*substep was intended.
3001 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// 4095-scaled value >> 4 gives the 0..255 range; packus saturates to bytes
3003 __m128i pix = _mm_srai_epi16(submod, 4);
3004 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (32-bit store)
3008 __m128i pix = _mm_srai_epi16(submod, 4);
3009 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span op (SSE2): out4ub = saturate(ina4ub + max(inb4ub - subcolor*255, 0)), 4 bytes per pixel.
// Processes two pixels per iteration, then a single-pixel variant for the remainder.
// NOTE(review): the guard around the single-pixel tail is on a line not shown in this excerpt.
3016 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3019 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255, components 0 and 2 swapped to match the BGRA byte order,
// then packed to 16-bit lanes and duplicated so it covers two pixels
3020 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3021 localcolor = _mm_packs_epi32(localcolor, localcolor);
3022 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs from bytes to 16-bit lanes
3024 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3025 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// saturating subtract clamps at zero; packus clamps the sum to 255
3026 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3027 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant (32-bit load/store)
3031 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3032 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3033 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3034 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): out4ub = (ina4ub * inb4ub) >> 8, per byte, 4 bytes per pixel.
// Processes two pixels per iteration, then a single-pixel variant for the remainder.
// NOTE(review): the guard around the single-pixel tail is on a line not shown in this excerpt.
3039 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3042 int x, startx = span->startx, endx = span->endx;
3043 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds bytes in the low byte of each 16-bit lane, pix2 in the high byte (value*256),
// so the high half of the 16x16 product is (a * b) >> 8
3045 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3046 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3047 pix1 = _mm_mulhi_epu16(pix1, pix2);
3048 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant (32-bit load/store)
3052 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3053 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3054 pix1 = _mm_mulhi_epu16(pix1, pix2);
3055 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): out4ub = saturate(ina4ub + inb4ub), per byte, 4 bytes per pixel.
// Processes two pixels per iteration, then a single-pixel variant for the remainder.
// NOTE(review): the guard around the single-pixel tail is on a line not shown in this excerpt.
3060 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3063 int x, startx = span->startx, endx = span->endx;
3064 for (x = startx;x+2 <= endx;x+=2)
// widen to 16-bit lanes, add, and let packus clamp the sum to 255
3066 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3067 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3068 pix1 = _mm_add_epi16(pix1, pix2);
3069 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant (32-bit load/store)
3073 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3074 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3075 pix1 = _mm_add_epi16(pix1, pix2);
3076 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): out4ub = saturate(ina4ub + inb4ub * tint), per byte, 4 bytes per pixel.
// The tint is read as 4 floats and used in the order given (no channel swap is applied here).
// Processes two pixels per iteration, then a single-pixel variant for the remainder.
// NOTE(review): the guard around the single-pixel tail is on a line not shown in this excerpt.
3081 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3084 int x, startx = span->startx, endx = span->endx;
// tint as fixed point (x256), packed to 16-bit lanes and duplicated to cover two pixels
3085 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3086 tint = _mm_packs_epi32(tint, tint);
3087 for (x = startx;x+2 <= endx;x+=2)
// pix2 sits in the high byte of each lane (value*256), so mulhi(tint, pix2) = byte * tint
3089 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3090 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3091 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3092 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant (32-bit load/store)
3096 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3097 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3098 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3099 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): blends inb4ub over ina4ub using inb's own 4th byte as the weight:
//   out = ina + ((inb - ina) * alpha) >> 8, per byte, 4 bytes per pixel.
// Processes two pixels per iteration, then a single-pixel variant for the remainder.
// NOTE(review): the guard around the single-pixel tail is on a line not shown in this excerpt.
3104 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3107 int x, startx = span->startx, endx = span->endx;
3108 for (x = startx;x+2 <= endx;x+=2)
3110 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3111 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's 4th 16-bit lane (alpha) across that pixel's four lanes
3112 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half product equals (diff * alpha) >> 8
3113 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3114 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant: only the low four lanes are meaningful, so one shuffle suffices
3118 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3119 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3120 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3121 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3122 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): blends a constant colour over in4ub using the colour's own alpha as weight:
//   out = in + ((color - in) * alpha) >> 8, per byte, 4 bytes per pixel.
// Processes two pixels per iteration, then a single-pixel variant for the remainder.
// NOTE(review): the guard around the single-pixel tail is on a line not shown in this excerpt.
3127 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3130 int x, startx = span->startx, endx = span->endx;
// colour scaled to 0..255 with components 0 and 2 swapped to match the BGRA byte order
3131 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// pack to 16-bit lanes, duplicated so it covers two pixels
3132 localcolor = _mm_packs_epi32(localcolor, localcolor);
// loop-invariant blend weight: the colour's alpha broadcast to every lane, pre-shifted by 4
3133 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3134 for (x = startx;x+2 <= endx;x+=2)
3136 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (diff << 4) * (alpha << 4), high half, equals (diff * alpha) >> 8
3137 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3138 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant (32-bit load/store)
3142 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3143 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3144 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage for the "generic" shader mode: transforms/projects the position array with the
// ModelViewProjection matrix uniform, and loads the colour and first texcoord arrays.
// The second texcoord array is loaded only when the SPECULAR permutation bit is set.
3151 void DPSOFTRAST_VertexShader_Generic(void)
3153 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3154 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3155 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3156 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3157 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage for the "generic" shader mode, working on 8-bit BGRA span buffers.
// With the DIFFUSE bit: sample the first texture, multiply by the interpolated attribute 1,
// and, if the SPECULAR bit is set, combine with the second texture using exactly one of
// multiply / add / alpha-blend chosen by the permutation bits.
// The final VaryingBGRA8 call writes only the interpolated attribute.
// NOTE(review): braces and any `else` keywords are on lines not shown in this excerpt;
// the VaryingBGRA8 call is presumably the else-branch of the DIFFUSE test - confirm.
3160 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3162 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3163 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3164 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3165 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3166 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3167 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture modulated by the interpolated attribute 1
3169 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3170 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3171 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3173 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
// multiply the second texture into the result
3174 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3177 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// add the second texture onto the result.
// This used to re-test SHADERPERMUTATION_COLORMAPPING, which the branch above already
// consumed, so the additive path could never run; it now tests the GLOW bit instead.
3179 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3182 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// alpha-blend the second texture over the result
3184 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3187 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured case: output the interpolated attribute 1 only
3192 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3193 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the post-process shader mode: transforms/projects the position array with the
// ModelViewProjection matrix uniform and loads two texcoord arrays.
// Note the second load names TEXCOORD1 and TEXCOORD4 as its two (different) array arguments.
3198 void DPSOFTRAST_VertexShader_PostProcess(void)
3200 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3201 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3202 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for the post-process shader mode, working on 8-bit BGRA span buffers:
// sample the first texture, optionally add thresholded bloom from the second texture,
// blend the view tint colour over the result, then hand the span to FinishBGRA8.
// The SATURATION and GAMMARAMPS permutations are tested but have no implementation yet.
3205 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3207 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3208 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3209 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3210 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3211 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3212 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3213 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// bloom: second texture minus the BloomColorSubtract uniform (clamped at 0), added in place
3215 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3216 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// view tint: blended over the result using the tint colour's own alpha
3218 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3219 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3221 // TODO: implement saturation
3223 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3225 // TODO: implement gammaramps
3227 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the depth/shadow shader mode: only the position array is processed,
// transformed/projected with the ModelViewProjection matrix uniform.
3232 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3234 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the depth/shadow shader mode: emits all-zero BGRA bytes for the span.
3237 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3239 // this is never called (because colormask is off when this shader is used)
3240 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3241 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3242 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// zero only the [startx, endx) pixel range of the buffer (4 bytes per pixel)
3243 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3244 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the flat-colour shader mode: transforms/projects the position array with the
// ModelViewProjection matrix uniform and transforms the first texcoord array by the TexMatrix uniform.
3249 void DPSOFTRAST_VertexShader_FlatColor(void)
3251 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3252 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the flat-colour shader mode (SSE2): output = colour texture * Color_Ambient,
// with the multiplier's alpha lane taken from the Alpha uniform.
// Pixels are written straight to the framebuffer row unless alpha test or a non-opaque blend mode
// is active; in that case they go to a local buffer that is handed to FinishBGRA8 at the end.
// NOTE(review): the advance of x after the 4-pixel path and the branch structure between the two
// paths are on lines not shown in this excerpt.
3255 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3258 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: colour buffer 0, addressed so that pixel[x*4] is span pixel x
3259 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3260 int x, startx = span->startx, endx = span->endx;
3261 __m128i Color_Ambientm;
3262 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3263 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3264 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3265 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3266 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finishing pass, so render into the local buffer instead
3267 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3268 pixel = buffer_FragColorbgra8;
// ambient colour as fixed point (x256), components 0 and 2 swapped to BGRA order
3269 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// replace the alpha lane with the Alpha uniform scaled by 255
3270 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3271 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// pack to 16-bit lanes, duplicated so it covers two pixels
3272 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3273 for (x = startx;x < endx;x++)
// fast path: at least 4 pixels remain and all 4 mask bytes equal 1
3276 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texel bytes go into the high byte of each lane, so mulhi gives byte * multiplier
3279 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3280 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3281 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3282 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path (32-bit load/store)
3288 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3289 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3290 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the span was rendered into the local buffer
3292 if (pixel == buffer_FragColorbgra8)
3293 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the vertex-colour shader mode: transforms/projects the position array with the
// ModelViewProjection matrix uniform, loads the colour array, and transforms the first texcoord
// array by the TexMatrix uniform.
3299 void DPSOFTRAST_VertexShader_VertexColor(void)
3301 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3302 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3303 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the vertex-colour shader mode (SSE2):
//   output = colour texture * (Color_Ambient + Color_Diffuse * interpolated vertex colour)
// with the alpha multiplier taken from the Alpha uniform (Color_Diffuse's alpha lane is zeroed).
// Pixels are written straight to the framebuffer row unless alpha test or a non-opaque blend mode
// is active; in that case they go to a local buffer that is handed to FinishBGRA8 at the end.
// NOTE(review): declarations of data/slope/mod2/pix2, the advance of x after the 4-pixel path and
// the branch structure between the two paths are on lines not shown in this excerpt.
3306 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3309 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: colour buffer 0, addressed so that pixel[x*4] is span pixel x
3310 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3311 int x, startx = span->startx, endx = span->endx;
3312 __m128i Color_Ambientm, Color_Diffusem;
3314 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3315 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3316 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3317 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3318 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3319 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finishing pass, so render into the local buffer instead
3320 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3321 pixel = buffer_FragColorbgra8;
// ambient colour as fixed point (x256) in BGRA order, alpha lane replaced by Alpha uniform * 255
3322 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3323 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3324 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3325 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour as fixed point (x4096) in BGRA order, alpha lane cleared
3326 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3327 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3328 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex-colour attribute: swap to BGRA order, move to the first pixel, and pre-scale by 4096
// so that mulhi(diffuse x4096, colour x4096) lands on the same x256 scale as the ambient term
3329 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3330 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3331 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3332 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3333 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3334 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration here, plus three times inside the 4-pixel path
3335 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3337 __m128i color, mod, pix;
// fast path: at least 4 pixels remain and all 4 mask bytes equal 1
3338 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four consecutive z values; each is broadcast in turn to scale that pixel's attribute
3341 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3342 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3343 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3344 data = _mm_add_ps(data, slope);
3345 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3346 data = _mm_add_ps(data, slope);
3347 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3348 data = _mm_add_ps(data, slope);
3349 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex colour + ambient) * texel, pixels 0-1 in pix and 2-3 in pix2
3350 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3351 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3352 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3353 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3354 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path (32-bit load/store)
3360 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3361 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3362 mod = _mm_packs_epi32(mod, mod);
3363 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3364 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the span was rendered into the local buffer
3366 if (pixel == buffer_FragColorbgra8)
3367 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the lightmap shader mode: transforms/projects the position array with the
// ModelViewProjection matrix uniform, transforms the first texcoord array by the TexMatrix uniform,
// and loads the TEXCOORD4 array (used by the pixel stage for the lightmap lookup).
3373 void DPSOFTRAST_VertexShader_Lightmap(void)
3375 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3376 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3377 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for the lightmap shader mode (SSE2):
//   output = colour texture * (Color_Ambient + Color_Diffuse * lightmap texel)
// and, in the first loop (after the GLOW permutation test), additionally + Color_Glow * glow texel.
// The alpha multiplier comes from the Alpha uniform (the diffuse and glow alpha lanes are zeroed).
// Pixels are written straight to the framebuffer row unless alpha test or a non-opaque blend mode
// is active; in that case they go to a local buffer that is handed to FinishBGRA8 at the end.
// NOTE(review): braces, the `else` separating the two loops, the pix2 declarations and the advance
// of x after each 4-pixel path are on lines not shown in this excerpt; the second loop is
// presumably the non-glow branch - confirm.
3380 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3383 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: colour buffer 0, addressed so that pixel[x*4] is span pixel x
3384 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3385 int x, startx = span->startx, endx = span->endx;
3386 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3387 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3388 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3389 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3390 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3391 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3392 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// colour texture uses TEXCOORD0, lightmap uses TEXCOORD4
3393 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3394 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the finishing pass, so render into the local buffer instead
3395 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3396 pixel = buffer_FragColorbgra8;
// ambient colour as fixed point (x256) in BGRA order, alpha lane replaced by Alpha uniform * 255
3397 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3398 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3399 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3400 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour as fixed point (x256) in BGRA order, alpha lane cleared
3401 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3402 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3403 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3404 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture shares TEXCOORD0 with the colour texture; glow colour prepared like diffuse
3406 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3407 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3408 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3409 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow colour; used by the single-pixel path below
3410 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3411 for (x = startx;x < endx;x++)
3413 __m128i color, lightmap, glow, pix;
// fast path: at least 4 pixels remain and all 4 mask bytes equal 1
3414 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3417 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3418 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3419 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * colour + glowcolour * glow, pixels 0-1 in pix and 2-3 in pix2
3420 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3421 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3422 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3423 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3424 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3425 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3426 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit term occupies the low 64 bits and the glow term the high 64 bits,
// both are multiplied in one mulhi and then the high half is added onto the low half
3432 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3433 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3434 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3435 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3436 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3437 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same computation without the glow term
3442 for (x = startx;x < endx;x++)
3444 __m128i color, lightmap, pix;
// fast path: at least 4 pixels remain and all 4 mask bytes equal 1
3445 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3448 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3449 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3450 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3451 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3452 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3453 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3454 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path (32-bit load/store)
3460 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3461 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3462 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3463 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the span was rendered into the local buffer
3466 if (pixel == buffer_FragColorbgra8)
3467 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shader stages below call these
// before their definitions appear further down.
3472 void DPSOFTRAST_VertexShader_LightDirection(void);
3473 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex shader for the fake-light mode: does no work of its own and simply
// delegates to the LightDirection vertex shader.
3475 void DPSOFTRAST_VertexShader_FakeLight(void)
3477 DPSOFTRAST_VertexShader_LightDirection();
// Pixel shader for the fake-light mode: forwards its arguments unchanged to the
// LightDirection pixel shader, which selects the fake-light behaviour itself by
// testing thread->shader_mode against SHADERMODE_FAKELIGHT.
3480 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3482 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the model-space light-direction-map mode: runs the
// LightDirection vertex shader and additionally loads TEXCOORD4, which the
// LightDirection pixel shader uses as the texture coordinate for the lightmap
// and deluxemap lookups in this mode.
3487 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3489 DPSOFTRAST_VertexShader_LightDirection();
3490 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the model-space light-direction-map mode: forwards its
// arguments unchanged to the LightDirection pixel shader, which handles this mode
// by testing thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3493 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3495 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the tangent-space light-direction-map mode: runs the
// LightDirection vertex shader and additionally loads TEXCOORD4, which the
// LightDirection pixel shader uses as the texture coordinate for the lightmap
// and deluxemap lookups in this mode.
3500 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3502 DPSOFTRAST_VertexShader_LightDirection();
3503 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the tangent-space light-direction-map mode: forwards its
// arguments unchanged to the LightDirection pixel shader, which handles this mode
// by testing thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3506 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3508 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the LightDirection mode; also called by the FakeLight and
// LightDirectionMap vertex shaders.
//
// For every vertex it expresses two vectors in the basis formed by the
// svector/tvector/normal attributes (TEXCOORD1/2/3):
//   - the LightDir uniform                   -> written to TEXCOORD5
//   - the vertex-to-eye vector (eye - pos)   -> written to TEXCOORD6
// Both outputs get w = 0. Finally the position array is passed through
// DPSOFTRAST_Array_TransformProject with the model-view-projection uniform.
//
// Takes no parameters and returns nothing; all inputs and outputs go through the
// global dpsoftrast state.
3513 void DPSOFTRAST_VertexShader_LightDirection(void)
3516 int numvertices = dpsoftrast.numvertices;
3518 float LightVector[4];
3519 float EyePosition[4];
3520 float EyeVectorModelSpace[4];
// Copy the uniforms into locals once, outside the per-vertex loop.
// Only components 0..2 of LightDir and EyePosition are used below.
3526 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3527 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3528 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3529 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3530 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3531 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3532 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3533 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex attribute arrays that the loop reads from
// dpsoftrast.post_array4f. Presumably DPSOFTRAST_Array_Load fills post_array4f
// from the bound input array - confirm against its definition.
// TEXCOORD0 is run through the TexMatrix uniform instead of a plain load.
3534 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3535 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3536 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3537 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3538 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3539 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and its svector/tvector/normal basis (xyz only).
3541 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3542 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3543 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3544 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3545 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3546 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3547 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3548 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3549 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3550 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3551 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3552 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction expressed in the s/t/normal basis: one dot product per axis.
3553 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3554 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3555 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3556 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3557 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3558 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3559 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye, then expressed in the same basis.
3560 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3561 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3562 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3563 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3564 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3565 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3566 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3567 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3568 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3569 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Done after the loop because the loop needs the untransformed positions.
// The meaning of the -1 argument is defined by DPSOFTRAST_Array_TransformProject.
3571 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helpers used by the float pixel shaders.
// Min/Max expand each argument more than once, so do not pass expressions with
// side effects. The Vector3 macros index their arguments with [0..2].
3574 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3575 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3576 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3577 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3578 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro: computes the length of v via sqrt(dot(v,v)); the lines that
// use that length are not visible here - presumably they scale v in place to
// unit length (TODO confirm, including how a zero length is handled).
3579 #define DPSOFTRAST_Vector3Normalize(v)\
3582 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Float (non-SSE) pixel shader for one span, shared by the LightDirection,
// FakeLight and LightDirectionMap (model-space / tangent-space) shader modes.
//
// Parameters:
//   thread   - per-thread state; supplies uniform4f, shader_permutation,
//              shader_mode and shader_exactspecularmath.
//   triangle - triangle being drawn; passed on to the span helpers.
//   span     - span being drawn; startx/endx give the pixel range [startx, endx).
//
// The work is split into three paths chosen by thread->shader_permutation:
//   1. SHADERPERMUTATION_SPECULAR: ambient + diffuse + specular lighting
//   2. SHADERPERMUTATION_DIFFUSE:  ambient + diffuse lighting
//   3. otherwise:                  ambient only
// In every path an optional glow term is added when SHADERPERMUTATION_GLOW is
// set. The result is written as bytes into buffer_FragColorbgra8 and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// NOTE(review): pants/shirt colormapping is sampled whenever
// SHADERPERMUTATION_COLORMAPPING is set, but in the lines visible here it is
// only added to diffusetex in the specular path - confirm this is intended.
3593 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one float per pixel for buffer_z, four bytes per
// pixel for each sampled texture and for the output colour.
3595 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3596 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3597 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3598 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3599 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3600 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3601 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3602 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3603 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3604 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3605 int x, startx = span->startx, endx = span->endx;
3606 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each *data / *slope pair is filled by DPSOFTRAST_CALCATTRIB4F and evaluated
// per pixel as data + slope * x.
3607 float LightVectordata[4];
3608 float LightVectorslope[4];
3609 float EyeVectordata[4];
3610 float EyeVectorslope[4];
3611 float VectorSdata[4];
3612 float VectorSslope[4];
3613 float VectorTdata[4];
3614 float VectorTslope[4];
3615 float VectorRdata[4];
3616 float VectorRslope[4];
3618 float diffusetex[4];
3620 float surfacenormal[4];
3621 float lightnormal[4];
3622 float lightnormal_modelspace[4];
3624 float specularnormal[4];
3627 float SpecularPower;
// Colour uniforms are stored with components 0 and 2 swapped (uniform +0 goes to
// index 2, +2 goes to index 0) so that index i lines up with byte i of the
// *bgra8 buffers - presumably RGB uniforms against BGRA texels.
3629 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3630 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3631 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3632 Color_Glow[3] = 0.0f;
3633 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3634 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3635 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component carries the Alpha uniform; it is the only factor
// applied to the output alpha channel (d[3]) in all three paths.
3636 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3637 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3638 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3639 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3640 Color_Pants[3] = 0.0f;
3641 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3642 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3643 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3644 Color_Shirt[3] = 0.0f;
// Texture sampling common to all paths: base colour always, pants/shirt and
// glow only when the corresponding permutation bit is set.
3645 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3646 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3647 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3649 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3650 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3652 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3654 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: ambient + diffuse + specular ----
3656 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3658 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3659 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3660 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3661 Color_Diffuse[3] = 0.0f;
3662 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3663 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3664 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3665 LightColor[3] = 0.0f;
3666 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3667 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3668 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3669 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3670 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the gloss texture's
// alpha byte (glosstex[3]) to form the pow() exponent.
3671 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Eye vector interpolants come from TEXCOORD6, which the LightDirection vertex
// shader fills.
3672 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3673 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific setup. The light-direction-map modes sample lightmap and
// deluxemap with TEXCOORD4; the model-space variant also needs the S/T/R
// vectors from TEXCOORD1/2/3.
3675 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3677 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3678 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3679 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3680 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3681 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3683 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3685 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3686 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3688 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light reuses the eye vector already set up above.
3690 // nothing of this needed
// Plain LightDirection mode: light vector interpolants from TEXCOORD5, which
// the LightDirection vertex shader fills.
3694 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop. `z` is assigned on a line not visible here - presumably from
// buffer_z[x]; confirm.
3697 for (x = startx;x < endx;x++)
3700 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3701 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3702 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3703 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3704 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Color_Pants[3] and Color_Shirt[3] are 0, so the alpha line adds nothing.
3706 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3707 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3708 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3709 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3711 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3712 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3713 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3714 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte * (1/128) - 1 maps 0..255 to roughly -1..1.
// Byte order is reversed (byte 2 -> x, byte 0 -> z).
3715 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3716 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3717 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3718 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Determine lightnormal and (for the map modes and fake light) LightColor.
3720 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3722 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3723 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3724 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3725 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
// NOTE(review): VectorS/T/R are evaluated as data + slope * x without the
// "* z" factor applied to the eye and light vectors elsewhere - confirm intended.
3727 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3728 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3729 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3730 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3732 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3733 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3734 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3735 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3737 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3738 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3739 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3740 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3742 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3743 DPSOFTRAST_Vector3Normalize(lightnormal);
3745 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes are scaled by 1/256 and divided by lightnormal z, with z
// clamped to at least 0.25 so the divisor cannot get arbitrarily small.
3747 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3748 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3749 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3750 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3753 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// Tangent-space deluxemap: the decoded texel is used directly as lightnormal
// (no normalize call is visible for this branch).
3755 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3756 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3757 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3759 float f = 1.0f / 256.0f;
3760 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3761 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3762 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3765 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: the light shines along the eye vector with colour (1, 1, 1).
3767 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3768 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3769 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3770 DPSOFTRAST_Vector3Normalize(lightnormal);
3772 LightColor[0] = 1.0;
3773 LightColor[1] = 1.0;
3774 LightColor[2] = 1.0;
// Plain LightDirection mode: interpolated light vector; LightColor keeps the
// uniform value loaded before the loop.
3778 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3779 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3780 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3781 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L clamped at zero.
3784 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term: reflection-vector form when shader_exactspecularmath is set,
// otherwise the half-vector form (normalize(light + eye) dotted with the normal).
3786 if(thread->shader_exactspecularmath)
3788 // reflect lightnormal at surfacenormal, take the negative of that
3789 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3791 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3792 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3793 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3794 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3796 // dot of this and normalize(EyeVectorFogDepth.xyz)
3797 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3798 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3799 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3800 DPSOFTRAST_Vector3Normalize(eyenormal);
3802 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3806 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3807 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3808 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3809 DPSOFTRAST_Vector3Normalize(eyenormal);
3811 specularnormal[0] = lightnormal[0] + eyenormal[0];
3812 specularnormal[1] = lightnormal[1] + eyenormal[1];
3813 specularnormal[2] = lightnormal[2] + eyenormal[2];
3814 DPSOFTRAST_Vector3Normalize(specularnormal);
3816 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform * gloss alpha / 255 (see SpecularPower above).
3819 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine the terms per channel, truncate to int and clamp the upper bound to
// 255 (no lower clamp is applied). Alpha is diffuse alpha * Color_Ambient[3].
3820 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3822 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3823 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3824 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3825 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3829 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3830 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3831 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3832 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3835 buffer_FragColorbgra8[x*4+0] = d[0];
3836 buffer_FragColorbgra8[x*4+1] = d[1];
3837 buffer_FragColorbgra8[x*4+2] = d[2];
3838 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: ambient + diffuse (no specular, no gloss texture) ----
3841 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3843 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3844 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3845 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3846 Color_Diffuse[3] = 0.0f;
3847 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3848 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3849 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3850 LightColor[3] = 0.0f;
3851 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Same mode-specific setup as the specular path, except that the eye vector is
// only set up for fake light, the one diffuse-path mode that reads it.
3853 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3855 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3856 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3857 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3858 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3859 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3861 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3863 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3864 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3866 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3868 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3872 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop; the lightnormal / LightColor selection mirrors the specular path.
3875 for (x = startx;x < endx;x++)
3878 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3879 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3880 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3881 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3882 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3883 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3884 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3885 DPSOFTRAST_Vector3Normalize(surfacenormal);
3887 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3889 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3890 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3891 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3892 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3894 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3895 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3896 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3897 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3899 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3900 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3901 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3902 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3904 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3905 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3906 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3907 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3909 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3910 DPSOFTRAST_Vector3Normalize(lightnormal);
3912 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3914 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3915 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3916 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3917 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3920 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3922 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3923 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3924 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3926 float f = 1.0f / 256.0f;
3927 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3928 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3929 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3932 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3934 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3935 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3936 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3937 DPSOFTRAST_Vector3Normalize(lightnormal);
3939 LightColor[0] = 1.0;
3940 LightColor[1] = 1.0;
3941 LightColor[2] = 1.0;
3945 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3946 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3947 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3948 DPSOFTRAST_Vector3Normalize(lightnormal);
3951 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Output = [glow +] diffusetex * (ambient + diffuse colour * N.L * light colour),
// truncated to int and clamped to at most 255.
3952 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3954 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3955 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3956 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3957 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3961 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3962 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3963 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3964 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3966 buffer_FragColorbgra8[x*4+0] = d[0];
3967 buffer_FragColorbgra8[x*4+1] = d[1];
3968 buffer_FragColorbgra8[x*4+2] = d[2];
3969 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (presumably the final else of the permutation
// test - the else line itself is not visible here) ----
3974 for (x = startx;x < endx;x++)
3977 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3978 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3979 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3980 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3982 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3984 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3985 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3986 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3987 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3991 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3992 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3993 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3994 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3996 buffer_FragColorbgra8[x*4+0] = d[0];
3997 buffer_FragColorbgra8[x*4+1] = d[1];
3998 buffer_FragColorbgra8[x*4+2] = d[2];
3999 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span colours to the common span-finish routine.
4002 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode.
//
// For every vertex it expresses two vectors in the basis formed by the
// svector/tvector/normal attributes (TEXCOORD1/2/3):
//   - vertex-to-light vector (LightPosition - position) -> written to TEXCOORD1
//   - vertex-to-eye vector   (EyePosition - position)   -> written to TEXCOORD2
// Both outputs get w = 0. After the loop the position array is passed through
// DPSOFTRAST_Array_TransformProject with the model-view-projection uniform, and
// DPSOFTRAST_Array_Transform is called with TEXCOORD3, POSITION and the
// ModelToLight uniform.
//
// Takes no parameters and returns nothing; all inputs and outputs go through the
// global dpsoftrast state.
4007 void DPSOFTRAST_VertexShader_LightSource(void)
4010 int numvertices = dpsoftrast.numvertices;
4011 float LightPosition[4];
4012 float LightVector[4];
4013 float LightVectorModelSpace[4];
4014 float EyePosition[4];
4015 float EyeVectorModelSpace[4];
// Copy the uniforms into locals once, outside the per-vertex loop.
// Only components 0..2 of LightPosition and EyePosition are used below.
4021 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4022 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4023 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4024 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4025 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4026 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4027 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4028 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex attribute arrays that the loop reads from
// dpsoftrast.post_array4f. Presumably DPSOFTRAST_Array_Load fills post_array4f
// from the bound input array - confirm against its definition.
4029 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4030 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4031 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4032 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4033 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4034 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4035 for (i = 0;i < numvertices;i++)
// Copy position and the s/t/normal basis into locals first: TEXCOORD1 and
// TEXCOORD2 are overwritten with the results further down in this iteration.
4037 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4038 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4039 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4040 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4041 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4042 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4043 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4044 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4045 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4046 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4047 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4048 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the light, then expressed in the s/t/normal basis.
4049 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4050 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4051 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4052 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4053 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4054 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4055 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4056 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4057 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4058 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vector from the vertex to the eye, expressed in the same basis.
4059 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4060 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4061 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4062 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4063 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4064 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4065 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4066 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4067 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4068 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Done after the loop because the loop needs the untransformed positions.
// The meaning of the -1 argument is defined by DPSOFTRAST_Array_TransformProject.
4070 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Presumably the argument order is (output array, input array, matrix), i.e.
// TEXCOORD3 receives the position transformed by ModelToLight - TODO confirm
// against DPSOFTRAST_Array_Transform, including which copy of POSITION it reads.
4071 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span pixel shader for the light-source mode: lights one span with a single attenuated light.
// thread   - supplies the uniforms, the shader_permutation flags and shader_exactspecularmath
// triangle - attribute setup used to interpolate the per-vertex vectors
// span     - span being shaded; pixels startx..endx-1 are processed
// Results are built as BGRA8 in buffer_FragColorbgra8 and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
// Three per-pixel loops follow: one for the SPECULAR permutation, one for the DIFFUSE
// permutation, and a last one that applies only the ambient term.
// NOTE(review): several short lines (braces, some declarations such as z/attenuation/d, and the
// statements controlled by some `if` tests) are not visible in this listing; remarks that depend
// on them are marked as assumptions.
4074 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch: buffer_z is filled by DPSOFTRAST_Draw_Span_Begin, the *bgra8 buffers hold 4 bytes per pixel and are indexed with x*4
4077 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4078 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4079 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4080 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4081 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4082 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4083 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4084 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4085 int x, startx = span->startx, endx = span->endx;
4086 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// each interpolated vector is evaluated per pixel as (data + slope*x) * z
4087 float CubeVectordata[4];
4088 float CubeVectorslope[4];
4089 float LightVectordata[4];
4090 float LightVectorslope[4];
4091 float EyeVectordata[4];
4092 float EyeVectorslope[4];
4094 float diffusetex[4];
4096 float surfacenormal[4];
4097 float lightnormal[4];
4099 float specularnormal[4];
4102 float SpecularPower;
4103 float CubeVector[4];
// Fetch colour uniforms. Components 0,1,2 of each uniform are stored at indices 2,1,0 so that
// index i of a colour multiplies byte i of the BGRA texture buffers below.
// Color_Glow is loaded here but is not read by any line visible in this function.
4106 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4107 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4108 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4109 Color_Glow[3] = 0.0f;
4110 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4111 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4112 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient slot carries the Alpha uniform rather than an ambient component
4113 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4114 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4115 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4116 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4117 Color_Diffuse[3] = 0.0f;
4118 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4119 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4120 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4121 Color_Specular[3] = 0.0f;
4122 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4123 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4124 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4125 Color_Pants[3] = 0.0f;
4126 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4127 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4128 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4129 Color_Shirt[3] = 0.0f;
4130 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4131 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4132 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4133 LightColor[3] = 0.0f;
// pre-scaled by 1/255 because it is later multiplied by the gloss alpha byte (0..255) to form the exponent
4134 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// interpolation setup: TEXCOORD1 -> light vector, TEXCOORD2 -> eye vector, TEXCOORD3 -> cube/attenuation vector
4135 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4136 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4137 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4138 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4139 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// sample the textures every permutation needs; optional layers are sampled only when their permutation bit is set
4140 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4141 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4143 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4144 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4146 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4147 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- SPECULAR permutation: ambient + diffuse + specular ----
4148 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4150 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4151 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4152 for (x = startx;x < endx;x++)
// NOTE(review): z is assigned on a line not visible here; presumably z = buffer_z[x] - confirm
4155 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4156 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4157 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// attenuation falls from 1 at the light to 0 where the cube vector has unit length
4158 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by this test is not visible; given the pre-clear above it presumably skips the pixel
4159 if (attenuation < 0.01f)
4161 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4163 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4164 if (attenuation < 0.01f)
// base colour as floats in 0..255 (bytes are not normalised)
4168 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4169 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4170 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4171 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4172 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add the tinted pants/shirt layers; the alpha lane adds nothing because Color_Pants[3] and Color_Shirt[3] are 0
4174 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4175 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4176 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4177 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4179 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4180 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4181 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4182 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// unpack the normal map: byte/128 - 1 maps 0..255 to roughly -1..+1; byte 2 feeds component 0 because the buffer is BGRA
4183 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4184 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4185 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4186 DPSOFTRAST_Vector3Normalize(surfacenormal);
4188 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4189 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4190 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4191 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: N.L clamped at zero
4193 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// exact specular: reflect the light vector about the surface normal and compare with the eye vector
4195 if(thread->shader_exactspecularmath)
4197 // reflect lightnormal at surfacenormal, take the negative of that
4198 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4200 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4201 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4202 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4203 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4205 // dot of this and normalize(EyeVectorFogDepth.xyz)
4206 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4207 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4208 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4209 DPSOFTRAST_Vector3Normalize(eyenormal);
4211 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// half-vector specular: normalize(L + E) compared with the surface normal
// NOTE(review): presumably the else branch of the shader_exactspecularmath test (the `else` line is not visible) - confirm
4215 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4216 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4217 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4218 DPSOFTRAST_Vector3Normalize(eyenormal);
4220 specularnormal[0] = lightnormal[0] + eyenormal[0];
4221 specularnormal[1] = lightnormal[1] + eyenormal[1];
4222 specularnormal[2] = lightnormal[2] + eyenormal[2];
4223 DPSOFTRAST_Vector3Normalize(specularnormal);
4225 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform * gloss alpha (the 1/255 was folded into SpecularPower above)
4227 specular = pow(specular, SpecularPower * glosstex[3]);
// combine per channel: (diffusetex*(ambient + diffuse) + gloss*specular) * light colour [* cube filter] * attenuation;
// the result is truncated to int and only the upper bound (255) is clamped; alpha is the texture alpha
4229 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4231 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4232 attenuation *= (1.0f / 255.0f);
4233 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4234 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4235 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4236 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same combination without the cube filter factor
4240 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4241 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4242 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4243 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4245 buffer_FragColorbgra8[x*4+0] = d[0];
4246 buffer_FragColorbgra8[x*4+1] = d[1];
4247 buffer_FragColorbgra8[x*4+2] = d[2];
4248 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- DIFFUSE permutation: ambient + diffuse, no gloss/specular ----
4251 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4253 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4254 for (x = startx;x < endx;x++)
// attenuation and shadow tests mirror the SPECULAR loop above
4257 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4258 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4259 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4260 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4261 if (attenuation < 0.01f)
4263 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4265 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4266 if (attenuation < 0.01f)
4270 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4271 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4272 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4273 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4274 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4276 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4277 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4278 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4279 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4281 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4282 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4283 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4284 DPSOFTRAST_Vector3Normalize(surfacenormal);
4286 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4287 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4288 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4289 DPSOFTRAST_Vector3Normalize(lightnormal);
4291 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4292 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4294 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4295 attenuation *= (1.0f / 255.0f);
4296 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4297 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4298 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4299 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4303 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4304 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4305 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4306 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4308 buffer_FragColorbgra8[x*4+0] = d[0];
4309 buffer_FragColorbgra8[x*4+1] = d[1];
4310 buffer_FragColorbgra8[x*4+2] = d[2];
4311 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- ambient-only loop: no normal map lookup, colour scaled by Color_Ambient only ----
// NOTE(review): presumably the fallback when neither SPECULAR nor DIFFUSE is set (the `else` line is not visible) - confirm
4316 for (x = startx;x < endx;x++)
4319 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4320 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4321 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4322 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4323 if (attenuation < 0.01f)
4325 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4327 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4328 if (attenuation < 0.01f)
4332 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4333 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4334 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4335 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4336 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4338 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4339 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4340 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4341 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4343 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4345 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4346 attenuation *= (1.0f / 255.0f);
4347 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4348 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4349 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4350 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4354 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4355 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4356 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4357 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4359 buffer_FragColorbgra8[x*4+0] = d[0];
4360 buffer_FragColorbgra8[x*4+1] = d[1];
4361 buffer_FragColorbgra8[x*4+2] = d[2];
4362 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colours to the common span finisher
4365 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4371 void DPSOFTRAST_VertexShader_Refraction(void)
4373 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4374 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4375 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4378 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4380 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4382 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4384 int x, startx = span->startx, endx = span->endx;
4387 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4388 //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4389 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4392 float ModelViewProjectionPositiondata[4];
4393 float ModelViewProjectionPositionslope[4];
4396 float ScreenScaleRefractReflect[2];
4397 float ScreenCenterRefractReflect[2];
4398 float DistortScaleRefractReflect[2];
4399 float RefractColor[4];
4401 const unsigned char * RESTRICT pixelbase;
4402 const unsigned char * RESTRICT pixel[4];
4403 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4404 if(!texture) return;
4405 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4408 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4409 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4410 //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
4413 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4416 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4417 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4418 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4419 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4420 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4421 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4422 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4423 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4424 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4425 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4428 for (x = startx;x < endx;x++)
4430 float SafeScreenTexCoord[2];
4431 float ScreenTexCoord[2];
4438 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4439 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4441 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4442 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4443 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4445 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4446 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4447 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4448 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4449 DPSOFTRAST_Vector3Normalize(v);
4450 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4451 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4453 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4454 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4456 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32678};
4457 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4458 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4459 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4460 int tci[2] = { tc[0]>>16, tc[1]>>16 };
4461 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4462 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4463 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4464 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4465 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4466 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4467 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4468 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4469 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4470 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4471 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4472 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4476 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
4477 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4478 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4479 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4480 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4481 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4482 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4488 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4489 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4490 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4491 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4492 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4495 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4500 void DPSOFTRAST_VertexShader_Water(void)
4502 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4506 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4509 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4510 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4511 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4512 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4513 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4518 void DPSOFTRAST_VertexShader_ShowDepth(void)
4520 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4523 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4526 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4527 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4528 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4529 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4530 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4535 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4537 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4540 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4543 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4544 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4545 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4546 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4547 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4552 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4554 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4557 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4560 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4561 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4562 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4563 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4564 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode description: the vertex and span (pixel) stage entry points plus the
// lists of vertex arrays and texture units associated with the mode.
// NOTE(review): the rows of DPSOFTRAST_ShaderModeTable start with an integer before the two
// function pointers, so a leading integer member is presumably declared on a line not
// visible in this listing - confirm.
4569 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; takes no arguments
4572 void (*Vertex)(void);
// span stage; called with the thread state, the triangle and the span to shade
4573 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices; the table rows end each list with ~0 (0xFF once stored in an unsigned char)
4574 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* texture unit indices, terminated the same way in the table rows that supply them
4575 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4577 DPSOFTRAST_ShaderModeInfo;
4579 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4581 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4582 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4583 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4584 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4585 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4586 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4587 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4588 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4589 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4590 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4591 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4592 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4593 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4594 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4595 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4596 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Processes every span queued in thread->spans[0 .. numspans-1] and then
// resets thread->numspans to 0.
// With depth testing enabled and a depth buffer present, each span gets a
// per-pixel pass mask from the depth comparison and is handed to the current
// shader mode's Span callback when a color buffer and color mask are active.
// Without depth testing the mask is set to all-pass and only the Span
// callback runs.
4599 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4606 // unsigned int *colorpixel;
4607 unsigned int *depthpixel;
4613 DPSOFTRAST_State_Triangle *triangle;
4614 DPSOFTRAST_State_Span *span;
// per-pixel pass/fail flags for the span being processed; span->pixelmask is pointed at this buffer
4615 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4616 for (i = 0; i < thread->numspans; i++)
4618 span = &thread->spans[i];
4619 triangle = &thread->triangles[span->triangle];
4620 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope, w[2] = origin)
// at the span position and convert value and x slope to integer depth units;
// the polygon offset terms are subtracted from the starting depth
4622 wslope = triangle->w[0];
4623 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4624 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4625 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4626 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4627 startx = span->startx;
// fill pixelmask[startx..endx) with the result of comparing the stored depth
// against the interpolated depth d, according to the active depth function
4629 switch(thread->fb_depthfunc)
4632 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4633 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4634 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4635 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4636 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4637 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4638 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4640 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4641 //for (x = startx;x < endx;x++)
4642 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4643 // if there is no color buffer, skip pixel shader
// scan over masked-out pixels at both ends of the span (loop bodies are not
// shown in this listing; presumably they advance startx / retreat endx)
4644 while (startx < endx && !pixelmask[startx])
4646 while (endx > startx && !pixelmask[endx-1])
4649 continue; // no pixels to fill
4650 span->pixelmask = pixelmask;
4651 span->startx = startx;
4653 // run pixel shader if appropriate
4654 // do this before running depthmask code, to allow the pixelshader
4655 // to clear pixelmask values for alpha testing
4656 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4657 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth-write pass; the loop body is not shown in this listing (presumably it
// stores d into depthpixel[x] for pixels still set in pixelmask - confirm)
4658 if (thread->depthmask)
4659 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4665 // no depth testing means we're just dealing with color...
4666 // if there is no color buffer, skip pixel shader
4667 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes: mark the whole range before shading
4669 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4670 span->pixelmask = pixelmask;
4671 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans have been consumed
4675 thread->numspans = 0;
// Draw command payload: size of the vertex data, the row range touched by the
// draw (starty/endy), a reference count shared by all threads, the clipped
// flag, the vertex range, the triangle count, and pointers to the vertex
// arrays and to one of the two element lists.
4678 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of one thread.
// For each triangle: clips against the near plane, computes integer screen
// bounds, sets up the plane equations for w and for every attribute array the
// shader mode uses, picks mip levels, and emits spans for the rows that fall
// into this thread's two row bands. Spans and triangles are flushed through
// DPSOFTRAST_Draw_ProcessSpans whenever their queues fill up and once more at
// the end. The thread drops its reference on the command when done; the last
// reference frees command->arrays if it was allocated outside the command pool.
4680 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4683 int cullface = thread->cullface;
4684 int minx, maxx, miny, maxy;
4685 int miny1, maxy1, miny2, maxy2;
4686 __m128i fbmin, fbmax;
4687 __m128 viewportcenter, viewportscale;
4688 int firstvertex = command->firstvertex;
4689 int numvertices = command->numvertices;
4690 int numtriangles = command->numtriangles;
4691 const int *element3i = command->element3i;
4692 const unsigned short *element3s = command->element3s;
4693 int clipped = command->clipped;
4700 int starty, endy, bandy;
4704 __m128 triangleedge1, triangleedge2, trianglenormal;
4707 DPSOFTRAST_State_Triangle *triangle;
4708 DPSOFTRAST_Texture *texture;
4709 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two row bands to the vertical scissor range
4710 miny = thread->fb_scissor[1];
4711 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4712 miny1 = bound(miny, thread->miny1, maxy);
4713 maxy1 = bound(miny, thread->maxy1, maxy);
4714 miny2 = bound(miny, thread->miny2, maxy);
4715 maxy2 = bound(miny, thread->maxy2, maxy);
// the draw's row range misses both bands: nothing to rasterize here, but this
// thread's reference on the command is still released
4716 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4718 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare header carries its vertex data in a separate allocation
4720 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4721 MM_FREE(command->arrays);
4725 minx = thread->fb_scissor[0];
4726 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clip bounds packed as repeated 16-bit (x, y) pairs for the SIMD bounds test;
// the maximum is made inclusive by subtracting 1
4727 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4728 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4729 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4730 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4731 screen[3] = _mm_setzero_ps();
4732 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4733 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen coordinates, followed by the
// post-transform arrays (position first)
4735 const float *screencoord4f = command->arrays;
4736 const float *arrays = screencoord4f + numvertices*4;
4738 // generate the 3 edges of this triangle
4739 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices are made relative to firstvertex; they come from the 16-bit
// list or the 32-bit list (the selecting conditions are not shown in this listing)
4742 e[0] = element3s[i*3+0] - firstvertex;
4743 e[1] = element3s[i*3+1] - firstvertex;
4744 e[2] = element3s[i*3+2] - firstvertex;
4748 e[0] = element3i[i*3+0] - firstvertex;
4749 e[1] = element3i[i*3+1] - firstvertex;
4750 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below (their bodies are only partly
// visible in this listing):
//  SKIPBACKFACE computes the scalar cross product of two screen-space edges
//   into trianglenormal and compares it against zero;
//  CLIPPEDVERTEXLERP(k,p1,p2) stores the fraction clipdist[p1] / (clipdist[p1] - clipdist[p2])
//   in clipfrac[p1] and projects the interpolated vertex into screen[k];
//  CLIPPEDVERTEXCOPY(k,p1) loads the stored screen coordinate of vertex p1 into screen[k];
//  GENATTRIBCOPY / GENATTRIBLERP load or interpolate (with clipfrac[p1]) one attribute;
//  GENATTRIBS applies the copy/lerp combination belonging to one of the clip cases.
4759 #define SKIPBACKFACE \
4760 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4761 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4762 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4763 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4764 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4768 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4772 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4777 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4778 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4780 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4781 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4783 #define CLIPPEDVERTEXCOPY(k,p1) \
4784 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4786 #define GENATTRIBCOPY(attrib, p1) \
4787 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4788 #define GENATTRIBLERP(attrib, p1, p2) \
4790 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4791 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4793 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4797 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4798 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4799 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4800 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4801 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4802 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4803 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4809 // calculate distance from nearplane
4810 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4811 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4812 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4813 if (clipdist[0] >= 0.0f)
4815 if (clipdist[1] >= 0.0f)
4817 if (clipdist[2] >= 0.0f)
4820 // triangle is entirely in front of nearplane
4821 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4828 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4836 if (clipdist[2] >= 0.0f)
4838 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4845 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4852 else if (clipdist[1] >= 0.0f)
4854 if (clipdist[2] >= 0.0f)
4856 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4863 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4869 else if (clipdist[2] >= 0.0f)
4871 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4876 else continue; // triangle is entirely behind nearplane
4879 // calculate integer y coords for triangle points
4880 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4881 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4882 screenmin = _mm_min_epi16(screeni, screenir),
4883 screenmax = _mm_max_epi16(screeni, screenir);
4884 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4885 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4886 screenmin = _mm_max_epi16(screenmin, fbmin);
4887 screenmax = _mm_min_epi16(screenmax, fbmax);
4888 // skip offscreen triangles
4889 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4891 starty = _mm_extract_epi16(screenmin, 1);
4892 endy = _mm_extract_epi16(screenmax, 1)+1;
// rows lie entirely in the gap between this thread's two bands
4893 if (starty >= maxy1 && endy <= miny2)
4895 screeny = _mm_srai_epi32(screeni, 16);
4898 triangle = &thread->triangles[thread->numtriangles];
4900 // calculate attribute plans for triangle data...
4901 // okay, this triangle is going to produce spans, we'd better project
4902 // the interpolants now (this is what gives perspective texturing),
4903 // this consists of simply multiplying all arrays by the W coord
4904 // (which is basically 1/Z), which will be undone per-pixel
4905 // (multiplying by Z again) to get the perspective-correct array
4908 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4909 __m128 mipedgescale, mipdensity;
// divide both edge vectors by the edge cross product (broadcast from lane 0);
// the four resulting factors turn differences along the two edges into
// per-pixel x and y slopes
4910 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4911 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4912 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4913 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4914 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4915 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4916 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4917 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4918 attribedge1 = _mm_sub_ss(w0, w1);
4919 attribedge2 = _mm_sub_ss(w2, w1);
4920 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4921 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4922 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4923 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4924 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// plane equation for w: [0] = x slope, [1] = y slope, [2] = value at screen origin
4925 _mm_store_ss(&triangle->w[0], attribxslope);
4926 _mm_store_ss(&triangle->w[1], attribyslope);
4927 _mm_store_ss(&triangle->w[2], attriborigin);
4928 mipedgescale = _mm_setzero_ps();
// build the same kind of plane equation (on attribute * w) for every array
// listed for the current shader mode
4929 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4931 __m128 attrib0, attrib1, attrib2;
4932 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// out-of-range entries (the ~0 terminator in the table) presumably end the list;
// the guarded statement is not shown in this listing
4933 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4935 arrays += numvertices*4;
4936 GENATTRIBS(attrib0, attrib1, attrib2);
4937 attriborigin = _mm_mul_ps(attrib1, w1);
4938 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4939 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4940 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4941 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4942 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4943 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4944 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4945 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array that drives LOD selection: attribute delta along each edge
// scaled by the reciprocal screen-space length of that edge
4946 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4948 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4949 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4950 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4951 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for each texture unit the shader mode samples; units
// without mipmap filtering keep level 0 from the memset
4955 memset(triangle->mip, 0, sizeof(triangle->mip));
4956 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4958 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4959 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4961 texture = thread->texbound[texunit];
4962 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4964 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4965 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4966 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4967 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4968 // this will be multiplied in the texturing routine by the texture resolution
4969 y = _mm_cvtss_si32(mipdensity);
// y holds a squared density, so 0.5*log2(y) gives the level; clamp to the last mipmap
4972 y = (int)(log((float)y)*0.5f/M_LN2);
4973 if (y > texture->mipmaps - 1)
4974 y = texture->mipmaps - 1;
4975 triangle->mip[texunit] = y;
// walk the rows of the first band (up to maxy1), then continue from miny2 in
// the second band (up to maxy2)
4981 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4984 __m128 xcoords, xslope;
// one 32-bit lane per screen point: all ones where y is greater than that point's integer y
4985 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4986 int yccmask = _mm_movemask_epi8(ycc);
// edge0p/edge0n and edge1p/edge1n index the endpoints of the two edges that
// bound the current row
4987 int edge0p, edge0n, edge1p, edge1n;
// edge table for four screen points (the enclosing switch header is not shown in this listing)
4994 case 0xFFFF: /*0000*/ y = endy; continue;
4995 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4996 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4997 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4998 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4999 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5000 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5001 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5002 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5003 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5004 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5005 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5006 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5007 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5008 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5009 case 0x0000: /*1111*/ y++; continue;
// edge table for three screen points (the enclosing switch header is not shown in this listing)
5017 case 0xFFFF: /*000*/ y = endy; continue;
5018 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5019 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5020 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5021 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5022 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5023 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5024 case 0x0000: /*111*/ y++; continue;
// nexty: lanes already passed become 0x7FFF, the others keep their point's y,
// and the minimum across lanes is taken; the result is limited to the last
// row of the current band
5027 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5028 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5029 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5030 nexty = _mm_extract_epi16(ycc, 0);
5031 if (nexty >= bandy) nexty = bandy-1;
// per-row x step (dx/dy) of both edges, then their x positions at row y, plus 0.5
5032 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5033 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5034 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5035 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5036 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the edge with the smaller x in the low half so startx <= endx
5037 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5039 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5040 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5042 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5044 int startx, endx, offset;
5045 startx = _mm_cvtss_si32(xcoords);
5046 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5049 if (startx < 0) startx = 0;
// advance startx toward minx in whole multiples of the maximum span length
// (assumes DPSOFTRAST_DRAW_MAXSPANLENGTH is a power of two - TODO confirm)
5050 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5052 if (endx > maxx) endx = maxx;
5053 if (startx >= endx) continue;
// split the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// span->startx/endx are relative to the chunk offset
5054 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5056 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5057 span->triangle = thread->numtriangles;
5060 span->startx = max(minx - offset, 0);
5061 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5062 if (span->startx >= span->endx)
// span queue full: shade what has been collected so far
5064 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5065 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle list full: flush pending spans, then start a new list
5070 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5072 DPSOFTRAST_Draw_ProcessSpans(thread);
5073 thread->numtriangles = 0;
// release this thread's reference; the last one frees vertex data that was
// allocated outside the command pool
5077 if (!ATOMIC_DECREMENT(command->refcount))
5079 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5080 MM_FREE(command->arrays);
// flush whatever is still queued for this draw
5083 if (thread->numspans > 0 || thread->numtriangles > 0)
5085 DPSOFTRAST_Draw_ProcessSpans(thread);
5086 thread->numtriangles = 0;
// Allocates a Draw command together with storage for its vertex data and a
// private copy of the element list.
// The data area holds, in order: numvertices screen coordinates, numvertices
// post-transform positions, one float[4] array per attribute array used by
// the current shader mode, and the triangle indices. When header plus data
// would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data is allocated with
// MM_CALLOC and only the header goes into the command pool; otherwise the
// data follows the header inside the pool.
// Also points dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] at the
// new storage. The return statement is not shown in this listing; the caller
// uses the result as the new command.
5091 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5095 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] arrays are always present: screen coordinates and positions
5096 int datasize = 2*numvertices*sizeof(float[4]);
5097 DPSOFTRAST_Command_Draw *command;
5098 unsigned char *data;
// add one float[4] array per attribute array listed for the shader mode
5099 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5101 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5102 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5104 datasize += numvertices*sizeof(float[4]);
// room for the element list in whichever index width is supplied (the
// selecting conditions are not shown in this listing)
5107 datasize += numtriangles*sizeof(unsigned short[3]);
5109 datasize += numtriangles*sizeof(int[3]);
5110 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for one pool command: keep only the header in the pool and put the data in its own allocation
5111 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5113 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
// NOTE(review): the MM_CALLOC result is used without a NULL check in the visible lines
5114 data = (unsigned char *)MM_CALLOC(datasize, 1);
5118 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5119 data = (unsigned char *)command + commandsize;
5121 command->firstvertex = firstvertex;
5122 command->numvertices = numvertices;
5123 command->numtriangles = numtriangles;
5124 command->arrays = (float *)data;
// carve the data area into the individual arrays
5125 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5126 dpsoftrast.firstvertex = firstvertex;
5127 dpsoftrast.numvertices = numvertices;
5128 dpsoftrast.screencoord4f = (float *)data;
5129 data += numvertices*sizeof(float[4]);
5130 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5131 data += numvertices*sizeof(float[4]);
5132 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5134 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5135 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5137 dpsoftrast.post_array4f[j] = (float *)data;
5138 data += numvertices*sizeof(float[4]);
// copy the caller's indices into the command so the command owns its element list
5140 command->element3i = NULL;
5141 command->element3s = NULL;
5144 command->element3s = (unsigned short *)data;
5145 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5149 command->element3i = (int *)data;
5150 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queues a draw of numtriangles triangles over the vertex range
// [firstvertex, firstvertex + numvertices).
// Allocates the Draw command, runs the current shader mode's Vertex callback,
// and records the row range the draw covers. A draw with an empty row range
// is removed from the command pool again. Otherwise the command is given one
// reference per thread and either handed to the worker threads or executed
// through DPSOFTRAST_Draw_FlushThreads.
5155 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5157 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5158 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the row range of the draw to the framebuffer height
5159 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5160 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing visible: free separately allocated vertex data and take the command
// back out of the pool (the statement leaving the function is not shown in this listing)
5161 if (command->starty >= command->endy)
5163 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5164 MM_FREE(command->arrays);
5165 DPSOFTRAST_UndoCommand(command->commandsize);
5168 command->clipped = dpsoftrast.drawclipped;
// every thread interprets the command once and releases one reference
5169 command->refcount = dpsoftrast.numthreads;
5171 if (dpsoftrast.usethreads)
5174 DPSOFTRAST_Draw_SyncCommands();
// wake only idle (starving) threads whose row bands overlap the draw
5175 for (i = 0; i < dpsoftrast.numthreads; i++)
5177 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5178 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5179 Thread_CondSignal(thread->drawcond);
// presumably the non-threaded branch (its `else` is not shown in this listing):
// execute the queued commands right away
5184 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command payload: the new framebuffer width and height.
5188 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler: flags the thread's framebuffer-dependent state for
// revalidation. The command's width/height are not read in the visible line.
5189 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5191 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Installs a new set of render targets: framebuffer size, depth buffer and up
// to four color buffers. Updates the global framebuffer state, recomputes the
// viewport center/scale for the new size, and queues a SetRenderTargets
// command carrying the new width and height.
5193 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5195 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any target differs from the current one; the statement guarded by
// this condition is not shown in this listing (presumably pending work is
// flushed before the buffers change - confirm)
5196 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5197 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5198 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5200 dpsoftrast.fb_width = width;
5201 dpsoftrast.fb_height = height;
5202 dpsoftrast.fb_depthpixels = depthpixels;
5203 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5204 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5205 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5206 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5207 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5208 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5209 command->width = width;
5210 command->height = height;
// Interprets the commands in the shared command pool on behalf of one thread,
// starting at thread->commandoffset and stopping when endoffset is reached,
// then stores the new position back into thread->commandoffset.
5213 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5215 int commandoffset = thread->commandoffset;
5216 while (commandoffset != endoffset)
5218 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5219 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call its
// DPSOFTRAST_Interpret_ handler, advance by the aligned size of its struct,
// and wrap to offset 0 once the end of the pool is reached
5221 #define INTERPCOMMAND(name) \
5222 case DPSOFTRAST_OPCODE_##name : \
5223 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5224 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5225 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5226 commandoffset = 0; \
5228 INTERPCOMMAND(Viewport)
5229 INTERPCOMMAND(ClearColor)
5230 INTERPCOMMAND(ClearDepth)
5231 INTERPCOMMAND(ColorMask)
5232 INTERPCOMMAND(DepthTest)
5233 INTERPCOMMAND(ScissorTest)
5234 INTERPCOMMAND(Scissor)
5235 INTERPCOMMAND(BlendFunc)
5236 INTERPCOMMAND(BlendSubtract)
5237 INTERPCOMMAND(DepthMask)
5238 INTERPCOMMAND(DepthFunc)
5239 INTERPCOMMAND(DepthRange)
5240 INTERPCOMMAND(PolygonOffset)
5241 INTERPCOMMAND(CullFace)
5242 INTERPCOMMAND(AlphaTest)
5243 INTERPCOMMAND(AlphaFunc)
5244 INTERPCOMMAND(SetTexture)
5245 INTERPCOMMAND(SetShader)
5246 INTERPCOMMAND(Uniform4f)
5247 INTERPCOMMAND(UniformMatrix4f)
5248 INTERPCOMMAND(Uniform1i)
5249 INTERPCOMMAND(SetRenderTargets)
// Draw commands vary in size, so the offset advances by the size stored in the
// command; thread->commandoffset is updated right after each draw
5251 case DPSOFTRAST_OPCODE_Draw:
5252 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5253 commandoffset += command->commandsize;
5254 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5256 thread->commandoffset = commandoffset;
// handling of the Reset opcode is not shown in this listing
5259 case DPSOFTRAST_OPCODE_Reset:
5264 thread->commandoffset = commandoffset;
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
// Runs until thread->index becomes negative: interprets any commands between
// thread->commandoffset and dpsoftrast.drawcommand, then, if it has caught up,
// signals a thread waiting on waitcond (if any) and sleeps on drawcond with
// the starving flag set.
5267 static int DPSOFTRAST_Draw_Thread(void *data)
5269 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5270 while(thread->index >= 0)
5272 if (thread->commandoffset != dpsoftrast.drawcommand)
5274 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5278 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex before sleeping
5279 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5281 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving marks the thread as idle so producers know a drawcond signal is needed
5282 thread->starving = true;
5283 Thread_CondWait(thread->drawcond, thread->drawmutex);
5284 thread->starving = false;
5286 Thread_UnlockMutex(thread->drawmutex);
// Makes sure every queued command has been executed and then marks the
// command pool as empty.
// With worker threads: wakes each idle thread that still has commands
// pending, then waits on each thread's waitcond until it has caught up with
// dpsoftrast.drawcommand. Without worker threads the pending commands are
// interpreted directly in the calling thread.
5292 static void DPSOFTRAST_Draw_FlushThreads(void)
5294 DPSOFTRAST_State_Thread *thread;
5296 DPSOFTRAST_Draw_SyncCommands();
5297 if (dpsoftrast.usethreads)
// first pass: wake idle threads that are behind
5299 for (i = 0; i < dpsoftrast.numthreads; i++)
5301 thread = &dpsoftrast.threads[i];
5302 if (thread->commandoffset != dpsoftrast.drawcommand)
5304 Thread_LockMutex(thread->drawmutex);
5305 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5306 Thread_CondSignal(thread->drawcond);
5307 Thread_UnlockMutex(thread->drawmutex);
// second pass: block until each thread reports that it has caught up
// NOTE(review): the wait is guarded by `if`, not a loop, in the visible lines;
// condition waits may wake spuriously - confirm this cannot return early
5310 for (i = 0; i < dpsoftrast.numthreads; i++)
5312 thread = &dpsoftrast.threads[i];
5313 if (thread->commandoffset != dpsoftrast.drawcommand)
5315 Thread_LockMutex(thread->drawmutex);
5316 if (thread->commandoffset != dpsoftrast.drawcommand)
5318 thread->waiting = true;
5319 Thread_CondWait(thread->waitcond, thread->drawmutex);
5320 thread->waiting = false;
5322 Thread_UnlockMutex(thread->drawmutex);
// presumably the non-threaded branch (its `else` is not shown in this listing)
5328 for (i = 0; i < dpsoftrast.numthreads; i++)
5330 thread = &dpsoftrast.threads[i];
5331 if (thread->commandoffset != dpsoftrast.drawcommand)
5332 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// everything has been consumed, so the pool can be reused from scratch
5335 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point that executes all queued commands; forwards to
// DPSOFTRAST_Draw_FlushThreads.
5338 void DPSOFTRAST_Flush(void)
5340 DPSOFTRAST_Draw_FlushThreads();
// Public "finish" entry point. Its body is not shown in this listing
// (presumably it flushes all pending commands - confirm).
5343 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and the per-thread state.
//  width, height : framebuffer size in pixels
//  numthreads    : requested worker thread count; threading is used only when
//                  this is > 0 and Thread_HasThreads() reports support, and the
//                  count is then clamped to 1..64 (otherwise one thread state is used)
//  interlace     : clamped to 0..1 when threading is used, otherwise 0
//  colorpixels   : color buffer 0 (the other three color buffers start as NULL)
//  depthpixels   : depth buffer
// Each thread state receives default render state, is validated, and, when
// threading is used, gets its condition variables, mutex and worker thread.
// The return statement is not shown in this listing.
5348 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5358 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// u is declared in lines not shown in this listing (presumably a byte-order probe - confirm)
5359 dpsoftrast.bigendian = u.b[3];
5360 dpsoftrast.fb_width = width;
5361 dpsoftrast.fb_height = height;
5362 dpsoftrast.fb_depthpixels = depthpixels;
5363 dpsoftrast.fb_colorpixels[0] = colorpixels;
// clear the three remaining color targets (previously index 1 was written
// three times and indices 2 and 3 were never assigned explicitly)
5364 dpsoftrast.fb_colorpixels[1] = NULL;
5365 dpsoftrast.fb_colorpixels[2] = NULL;
5366 dpsoftrast.fb_colorpixels[3] = NULL;
5367 dpsoftrast.viewport[0] = 0;
5368 dpsoftrast.viewport[1] = 0;
5369 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5370 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5371 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// no texture storage is allocated here: the texture array stays NULL while
// texture_end starts at 1
5372 dpsoftrast.texture_firstfree = 1;
5373 dpsoftrast.texture_end = 1;
5374 dpsoftrast.texture_max = 0;
5375 dpsoftrast.color[0] = 1;
5376 dpsoftrast.color[1] = 1;
5377 dpsoftrast.color[2] = 1;
5378 dpsoftrast.color[3] = 1;
5379 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5380 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5381 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): the MM_CALLOC result is used without a NULL check in the visible lines
5382 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5383 for (i = 0; i < dpsoftrast.numthreads; i++)
5385 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread
5387 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned in the visible lines and so stays
// 0 from MM_CALLOC - confirm whether that is intended
5388 thread->colormask[1] = 1;
5389 thread->colormask[2] = 1;
5390 thread->colormask[3] = 1;
5391 thread->blendfunc[0] = GL_ONE;
5392 thread->blendfunc[1] = GL_ZERO;
5393 thread->depthmask = true;
5394 thread->depthtest = true;
5395 thread->depthfunc = GL_LEQUAL;
5396 thread->scissortest = false;
5397 thread->alphatest = false;
5398 thread->alphafunc = GL_GREATER;
5399 thread->alphavalue = 0.5f;
5400 thread->viewport[0] = 0;
5401 thread->viewport[1] = 0;
5402 thread->viewport[2] = dpsoftrast.fb_width;
5403 thread->viewport[3] = dpsoftrast.fb_height;
5404 thread->scissor[0] = 0;
5405 thread->scissor[1] = 0;
5406 thread->scissor[2] = dpsoftrast.fb_width;
5407 thread->scissor[3] = dpsoftrast.fb_height;
5408 thread->depthrange[0] = 0;
5409 thread->depthrange[1] = 1;
5410 thread->polygonoffset[0] = 0;
5411 thread->polygonoffset[1] = 0;
5413 DPSOFTRAST_RecalcThread(thread);
// empty work queues and idle synchronization flags
5415 thread->numspans = 0;
5416 thread->numtriangles = 0;
5417 thread->commandoffset = 0;
5418 thread->waiting = false;
5419 thread->starving = false;
// mark all derived state dirty and validate it once up front
5421 thread->validate = -1;
5422 DPSOFTRAST_Validate(thread, -1);
// synchronization objects must exist before the worker thread starts running
5424 if (dpsoftrast.usethreads)
5426 thread->waitcond = Thread_CreateCond();
5427 thread->drawcond = Thread_CreateCond();
5428 thread->drawmutex = Thread_CreateMutex();
5429 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5435 void DPSOFTRAST_Shutdown(void)
5438 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5440 DPSOFTRAST_State_Thread *thread;
5441 for (i = 0; i < dpsoftrast.numthreads; i++)
5443 thread = &dpsoftrast.threads[i];
5444 Thread_LockMutex(thread->drawmutex);
5446 Thread_CondSignal(thread->drawcond);
5447 Thread_UnlockMutex(thread->drawmutex);
5448 Thread_WaitThread(thread->thread, 0);
5449 Thread_DestroyCond(thread->waitcond);
5450 Thread_DestroyCond(thread->drawcond);
5451 Thread_DestroyMutex(thread->drawmutex);
5454 for (i = 0;i < dpsoftrast.texture_end;i++)
5455 if (dpsoftrast.texture[i].bytes)
5456 MM_FREE(dpsoftrast.texture[i].bytes);
5457 if (dpsoftrast.texture)
5458 free(dpsoftrast.texture);
5459 if (dpsoftrast.threads)
5460 MM_FREE(dpsoftrast.threads);
5461 memset(&dpsoftrast, 0, sizeof(dpsoftrast));