3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Alignment in bytes that MM_MALLOC / MM_CALLOC pass to _mm_malloc.
18 #define ATOMIC_SIZE 32
// Per-toolchain definitions of:
//   ALIGN(var)   - declare var with 16-byte alignment
//   ATOMIC(var)  - declare var with 32-byte alignment
//   MEMORY_BARRIER, ATOMIC_COUNTER, ATOMIC_INCREMENT/DECREMENT/ADD
// Apple: counters use the OSAtomic*32Barrier functions.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: counters use the __sync builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: counters use the Interlocked* functions.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: no alignment, no barrier, plain (non-atomic) arithmetic.
// NOTE(review): the #endif lines and some of the #ifndef guards around these
// fallbacks are not visible in this excerpt.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
74 #include <emmintrin.h>
// Provide _mm_cvtss_f32 through a builtin for GCC releases older than 4.6.
// The previous guard tested the undefined macro __GNUC (which evaluates to 0),
// so the major-version clause was always true, and the minor-version clause was
// not tied to major version 4.  clang also defines __GNUC__ (reporting an old
// GCC version) and is excluded explicitly.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
// Aligned allocation helpers built on _mm_malloc / _mm_free (ATOMIC_SIZE alignment).
80 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-style helper: allocates nmemb*size aligned bytes and zero-fills them
// when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow.
82 static void *MM_CALLOC(size_t nmemb, size_t size)
84 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
85 if (ptr != NULL) memset(ptr, 0, nmemb*size);
89 #define MM_FREE _mm_free
// Alternative definitions on top of the plain C allocator.
// NOTE(review): the conditional selecting between the two sets is not visible here.
91 #define MM_MALLOC(size) malloc(size)
92 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the vertex attribute arrays (position, color, eight texcoord sets).
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used as an array dimension
// (triangle attribs[], post_array4f[]).
96 typedef enum DPSOFTRAST_ARRAY_e
98 DPSOFTRAST_ARRAY_POSITION,
99 DPSOFTRAST_ARRAY_COLOR,
100 DPSOFTRAST_ARRAY_TEXCOORD0,
101 DPSOFTRAST_ARRAY_TEXCOORD1,
102 DPSOFTRAST_ARRAY_TEXCOORD2,
103 DPSOFTRAST_ARRAY_TEXCOORD3,
104 DPSOFTRAST_ARRAY_TEXCOORD4,
105 DPSOFTRAST_ARRAY_TEXCOORD5,
106 DPSOFTRAST_ARRAY_TEXCOORD6,
107 DPSOFTRAST_ARRAY_TEXCOORD7,
108 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table.
// NOTE(review): further members (flags, width, height, depth, sides, mipmaps, size)
// are used elsewhere in this file but their declarations are not visible here.
112 typedef struct DPSOFTRAST_Texture_s
119 DPSOFTRAST_TEXTURE_FILTER filter;
// atomic counter; presumably the number of outstanding binds -- TODO confirm
122 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL pointer marks the slot as unused
123 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
124 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records are aligned with ALIGN() and their sizes are rounded to COMMAND_SIZE.
// NOTE(review): the definition of ALIGN_SIZE is not visible in this excerpt.
128 #define COMMAND_SIZE ALIGN_SIZE
129 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command: opcode plus the record size in bytes.
131 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
133 unsigned char opcode;
134 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when a command does not fit
// in the space left before the end of the pool.
138 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> and the
// aligned struct DPSOFTRAST_Command_<name>, which starts with the common header.
// NOTE(review): the macro lines that open the struct and paste `fields` are not
// visible in this excerpt.
140 #define DEFCOMMAND(opcodeval, name, fields) \
141 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
142 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
144 unsigned char opcode; \
145 unsigned short commandsize; \
147 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool, and a size limit for one command.
// NOTE(review): no use of MAXCOMMANDSIZE is visible in this excerpt.
149 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
150 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Byte pool holding queued commands.
// NOTE(review): the freecommand / usedcommands members used by the allocator are
// not visible in this excerpt.
152 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
156 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
158 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data (other members are not visible in this excerpt).
160 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
162 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// Per attribute array, three float4 rows; DPSOFTRAST_CALCATTRIB evaluates them as
// row[2] + x*row[0] + y*row[1].
164 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
166 DPSOFTRAST_State_Triangle);
// Evaluate one interpolated attribute at the start of a span (SSE version).
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * attribs[arrayindex][0]
//                                  + span->y * attribs[arrayindex][1]
// `slope` and `data` must be __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
			_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
						_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of the same computation; `slope` and `data` are float[4].
// Every pointer/array argument is parenthesized so that expressions such as
// `spans + i` can be passed safely (the span argument previously was not).
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced while rasterizing a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle;             // triangle that generated this span
	int x;                    // x coordinate in the framebuffer
	int y;                    // y coordinate in the framebuffer
	int startx;               // first usable pixel (according to pixelmask)
	int endx;                 // end of the usable range (according to pixelmask)
	unsigned char *pixelmask; // true where the pixel passed the depth test, false otherwise
	int depthbase;            // depth value at x; add depthslope*startx for the first pixel's depth
	int depthslope;           // depth value change per pixel
}
DPSOFTRAST_State_Span);
// Sizes of the per-thread span, triangle and pixel-mask buffers.
#define DPSOFTRAST_DRAW_MAXSPANS       1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES   128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH  256

// Bits of the per-thread `validate` field: each marks one group of derived state as stale.
#define DPSOFTRAST_VALIDATE_FB         (1 << 0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC  (1 << 1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC  (1 << 2)
#define DPSOFTRAST_VALIDATE_DRAW \
	(DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer implements; DPSOFTRAST_RecalcBlendFunc maps the
// GL source/destination factor pair to one of these.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE,
	DPSOFTRAST_BLENDMODE_ALPHA,
	DPSOFTRAST_BLENDMODE_ADDALPHA,
	DPSOFTRAST_BLENDMODE_ADD,
	DPSOFTRAST_BLENDMODE_INVMOD,
	DPSOFTRAST_BLENDMODE_MUL,
	DPSOFTRAST_BLENDMODE_MUL2,
	DPSOFTRAST_BLENDMODE_SUBALPHA,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
	DPSOFTRAST_BLENDMODE_INVADD,
	DPSOFTRAST_BLENDMODE_TOTAL // number of blend modes
}
DPSOFTRAST_BLENDMODE;
// State owned by one rasterizer thread: a copy of the render state that the
// DPSOFTRAST_Interpret_* functions update, values derived from it, and work buffers.
// NOTE(review): many members used in this file (index, validate, viewport, scissor,
// miny1/maxy1/miny2/maxy2, drawmutex, ...) are not visible in this excerpt.
225 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
244 float polygonoffset[2];
// clip plane computed by DPSOFTRAST_RecalcClipPlane from clipplane and the viewport transform
246 ALIGN(float fb_clipplane[4]);
249 int shader_permutation;
250 int shader_exactspecularmath;
252 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
254 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
255 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
257 // DPSOFTRAST_VALIDATE_ flags
260 // derived values (DPSOFTRAST_VALIDATE_FB)
263 ALIGN(float fb_viewportcenter[4]);
264 ALIGN(float fb_viewportscale[4]);
266 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
269 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; compared with dpsoftrast.drawcommand in
// DPSOFTRAST_Draw_FreeCommandPool
278 ATOMIC(volatile int commandoffset);
// flags read and written around the condition-variable wait in DPSOFTRAST_Draw_FreeCommandPool
280 volatile bool waiting;
281 volatile bool starving;
288 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
289 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
290 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
292 DPSOFTRAST_State_Thread);
// Global rasterizer state: framebuffer pointers, vertex array pointers, the texture
// table, the worker thread array and the command pool.
// NOTE(review): several members used in this file (fb_width, fb_height, viewport,
// numthreads, usethreads, interlace, texture_max, texture_end, ...) are not visible
// in this excerpt.
294 typedef ATOMIC(struct DPSOFTRAST_State_s
298 unsigned int *fb_depthpixels;
299 unsigned int *fb_colorpixels[4];
302 ALIGN(float fb_viewportcenter[4]);
303 ALIGN(float fb_viewportscale[4]);
306 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
307 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
309 const float *pointer_vertex3f;
310 const float *pointer_color4f;
311 const unsigned char *pointer_color4ub;
312 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
315 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
316 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
317 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
321 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
322 float *screencoord4f;
328 int shader_permutation;
329 int shader_exactspecularmath;
// index where DPSOFTRAST_Texture_New starts searching for an unused slot
333 int texture_firstfree;
// texture table; reallocated by DPSOFTRAST_Texture_Grow
334 DPSOFTRAST_Texture *texture;
// set to a static message when an API call rejects its arguments
339 const char *errorstring;
344 DPSOFTRAST_State_Thread *threads;
// command pool offset published by DPSOFTRAST_Draw_SyncCommands
346 ATOMIC(volatile int drawcommand);
348 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
352 DPSOFTRAST_State dpsoftrast;
// Scale applied to (1 - depth) to produce integer depth buffer values (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels (expected in 0..1) into 0xAARRGGBB with rounding.
// Arguments are parenthesized so expressions can be passed, and the alpha byte is
// shifted as unsigned because 255 << 24 does not fit in a signed int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) \
	(((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Convert a float depth d to the integer depth buffer encoding: DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// Forward declarations; the definitions are not part of this excerpt.
359 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
360 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
362 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
364 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
365 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
366 fb_viewportcenter[3] = 0.5f;
367 fb_viewportcenter[0] = 0.0f;
368 fb_viewportscale[1] = 0.5f * viewport[2];
369 fb_viewportscale[2] = -0.5f * viewport[3];
370 fb_viewportscale[3] = 0.5f;
371 fb_viewportscale[0] = 1.0f;
374 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
376 if (dpsoftrast.interlace)
378 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
379 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
380 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
381 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
385 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
386 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
390 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
392 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
393 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
394 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
395 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
396 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recompute the framebuffer-space scissor rectangle (thread->fb_scissor = x, y,
// width, height) and then the viewport transform, clip plane and row ranges.
// NOTE(review): the declarations of x1/x2/y1/y2 and some clamping lines are not
// visible in this excerpt.
399 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
401 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
402 // and viewport projection values
405 x1 = thread->scissor[0];
406 x2 = thread->scissor[0] + thread->scissor[2];
// y is mirrored against fb_height (presumably bottom-up scissor to top-down rows -- TODO confirm)
407 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
408 y2 = dpsoftrast.fb_height - thread->scissor[1];
// scissor test disabled: use the whole framebuffer
409 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
411 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
413 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
414 thread->fb_scissor[0] = x1;
415 thread->fb_scissor[1] = y1;
416 thread->fb_scissor[2] = x2 - x1;
417 thread->fb_scissor[3] = y2 - y1;
419 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
420 DPSOFTRAST_RecalcClipPlane(thread);
421 DPSOFTRAST_RecalcThread(thread);
424 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
426 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
429 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
431 if (thread->blendsubtract)
433 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
435 #define BLENDFUNC(sfactor, dfactor, blendmode) \
436 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
437 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
438 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
443 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
446 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
447 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
448 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
449 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
450 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
451 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
452 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
453 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
454 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
455 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Call DPSOFTRAST_Validate only when one of the requested flags f is pending in
// thread->validate.  The expression always evaluates to 0.  The thread argument is
// parenthesized so that pointer expressions can be passed.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recompute the derived state groups that are both requested in `mask` and marked
// stale in thread->validate; each handled flag is cleared before its Recalc function runs.
// NOTE(review): two lines following the mask computation are not visible in this excerpt.
462 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only consider flags that are actually pending
464 mask &= thread->validate;
467 if (mask & DPSOFTRAST_VALIDATE_FB)
469 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
470 DPSOFTRAST_RecalcFB(thread);
472 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
474 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
475 DPSOFTRAST_RecalcDepthFunc(thread);
477 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
479 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
480 DPSOFTRAST_RecalcBlendFunc(thread);
// Look up a texture slot by index.  A slot is returned only when the index is in
// [1, texture_end) and the slot has pixel storage allocated.
// NOTE(review): the fall-through return is not visible here; callers test the
// result with `!texture`, so it presumably returns NULL.
484 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
486 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
487 return &dpsoftrast.texture[index];
// Enlarge the texture table and rebase every texbound[] pointer (global and
// per-thread) onto the reallocated array.
// NOTE(review): some lines (local declarations and at least one statement before
// the resize) are not visible in this excerpt.
491 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound-texture pointers can be rebased below
493 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
494 DPSOFTRAST_State_Thread *thread;
498 // expand texture array as needed
499 if (dpsoftrast.texture_max < 1024)
500 dpsoftrast.texture_max = 1024;
502 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites dpsoftrast.texture without a NULL
// check; on failure the old table is leaked and the rebasing below uses NULL.
503 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
504 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
505 if (dpsoftrast.texbound[i])
506 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
507 for (j = 0; j < dpsoftrast.numthreads; j++)
509 thread = &dpsoftrast.threads[j];
510 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
511 if (thread->texbound[i])
512 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Create a texture.  The visible code validates the dimensions and flag
// combinations (setting dpsoftrast.errorstring when a check fails), picks the first
// unused slot in the texture table (growing the table if needed), fills in the
// mip level table and allocates zeroed pixel storage for all levels.
// NOTE(review): many lines of this function (local declarations, the return
// statements, the mip level loop header) are not visible in this excerpt.
516 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
525 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
526 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
527 DPSOFTRAST_Texture *texture;
// NOTE(review): the product of three ints can overflow before the range check below
528 if (width*height*depth < 1)
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
533 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
535 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
540 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
541 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
542 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
544 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
545 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
547 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): same message as above; the condition guarding it is not visible here
552 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
555 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
557 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
562 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
564 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
567 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the previous check verbatim
572 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
577 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero exactly when x is not a power of two (for x >= 1)
582 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
584 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
587 // find first empty slot in texture array
588 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
589 if (!dpsoftrast.texture[texnum].bytes)
591 dpsoftrast.texture_firstfree = texnum + 1;
592 if (dpsoftrast.texture_max <= texnum)
593 DPSOFTRAST_Texture_Grow();
594 if (dpsoftrast.texture_end <= texnum)
595 dpsoftrast.texture_end = texnum + 1;
596 texture = &dpsoftrast.texture[texnum];
597 memset(texture, 0, sizeof(*texture));
598 texture->flags = flags;
599 texture->width = width;
600 texture->height = height;
601 texture->depth = depth;
602 texture->sides = sides;
// mip level table entry: 4 bytes per texel; [0] offset of this level, [1] its
// byte size, [2..4] its width, height and depth
614 s = w * h * d * sides * 4;
615 texture->mipmap[mipmaps][0] = size;
616 texture->mipmap[mipmaps][1] = s;
617 texture->mipmap[mipmaps][2] = w;
618 texture->mipmap[mipmaps][3] = h;
619 texture->mipmap[mipmaps][4] = d;
// condition on "level is a single texel, or mipmapping was not requested"
// (the statement it controls is not visible in this excerpt)
622 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
628 texture->mipmaps = mipmaps;
629 texture->size = size;
631 // allocate the pixels now
// NOTE(review): the allocation result is not checked in the visible code
632 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Release a texture's pixel storage, clear its slot and update the free/used
// bookkeeping of the texture table.  Does nothing for an invalid index.
// NOTE(review): some lines before the MM_FREE call are not visible in this excerpt.
636 void DPSOFTRAST_Texture_Free(int index)
638 DPSOFTRAST_Texture *texture;
639 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
643 MM_FREE(texture->bytes);
644 texture->bytes = NULL;
645 memset(texture, 0, sizeof(*texture));
646 // adjust the free range and used range
647 if (dpsoftrast.texture_firstfree > index)
648 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing unused slots
649 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
650 dpsoftrast.texture_end--;
// Regenerate mip levels 1..mipmaps-1 of a texture by averaging texels of the
// previous level (4 bytes per texel).  Does nothing for an invalid index or a
// texture with a single level.
// NOTE(review): several lines (the computation of layer0/layer1/row0/row1 and the
// branch selecting the 3D or 2D path) are not visible in this excerpt.
652 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
654 int i, x, y, z, w, layer0, layer1, row0, row1;
655 unsigned char *o, *i0, *i1, *i2, *i3;
656 DPSOFTRAST_Texture *texture;
657 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
658 if (texture->mipmaps <= 1)
660 for (i = 1;i < texture->mipmaps;i++)
662 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's depth
666 if (layer1 >= texture->mipmap[i-1][4])
667 layer1 = texture->mipmap[i-1][4]-1;
668 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's height
672 if (row1 >= texture->mipmap[i-1][3])
673 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the four source rows in level i-1
// (layer0/layer1 x row0/row1)
674 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
675 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
676 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
677 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
678 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
679 w = texture->mipmap[i][2];
682 if (texture->mipmap[i-1][2] > 1)
684 // average 3D texture
// eight source texels per output texel, rounded
685 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
687 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
688 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
689 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
690 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
695 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are read but not advanced by this loop
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
698 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
699 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
700 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
701 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
707 if (texture->mipmap[i-1][2] > 1)
709 // average 2D texture (common case)
710 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
712 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
713 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
714 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
715 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
720 // 2D texture with parent width == 1
721 o[0] = (i0[0] + i1[0] + 1) >> 1;
722 o[1] = (i0[1] + i1[1] + 1) >> 1;
723 o[2] = (i0[2] + i1[2] + 1) >> 1;
724 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copy a blockwidth x blockheight rectangle of 4-byte texels from `pixels` into
// the texture at (blockx, blocky), then rebuild the mip levels.
// NOTE(review): the visible lines address mip level 0 only and never reference the
// `mip` parameter; several lines of this function are not visible in this excerpt.
731 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
733 DPSOFTRAST_Texture *texture;
735 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel; mipmap[0][2] is the width of level 0
740 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
741 while (blockheight > 0)
// one source row per iteration; the source is tightly packed
743 memcpy(dst, pixels, blockwidth * 4);
744 pixels += blockwidth * 4;
745 dst += texture->mipmap[0][2] * 4;
749 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replace all of mip level 0 with `pixels` (mipmap[0][1] bytes) and rebuild the
// remaining mip levels.  Does nothing for an invalid index.
// NOTE(review): some lines between the lookup and the copy are not visible here.
751 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
753 DPSOFTRAST_Texture *texture;
754 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
758 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
759 DPSOFTRAST_Texture_CalculateMipmaps(index);
761 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
763 DPSOFTRAST_Texture *texture;
764 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
765 return texture->mipmap[mip][2];
767 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
769 DPSOFTRAST_Texture *texture;
770 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
771 return texture->mipmap[mip][3];
773 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
775 DPSOFTRAST_Texture *texture;
776 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
777 return texture->mipmap[mip][4];
// Return a pointer to the first texel of mip level `mip`, or 0 for an invalid index.
// NOTE(review): `mip` is not range-checked in the visible code, and two lines
// before the return are not visible in this excerpt.
779 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
781 DPSOFTRAST_Texture *texture;
782 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of the level inside texture->bytes
785 return texture->bytes + texture->mipmap[mip][0];
// Set the filter mode of a texture.  Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected (errorstring is set) when the
// texture was created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
// NOTE(review): the lines after the error assignment are not visible in this excerpt.
787 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
789 DPSOFTRAST_Texture *texture;
790 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
791 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
793 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
798 texture->filter = filter;
801 static void DPSOFTRAST_Draw_FlushThreads(void);
803 static void DPSOFTRAST_Draw_SyncCommands(void)
805 if(dpsoftrast.usethreads) MEMORY_BARRIER;
806 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Try to make `space` bytes available in the command pool: recompute how much of
// the pool is still in use from each thread's commandoffset, and wait on a thread
// while there is not enough room.
// NOTE(review): many lines of this function (loop structure, the selection of
// waitindex, return statements) are not visible in this excerpt.
809 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
811 DPSOFTRAST_State_Thread *thread;
813 int freecommand = dpsoftrast.commandpool.freecommand;
814 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already
815 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
817 DPSOFTRAST_Draw_SyncCommands();
823 for (i = 0; i < dpsoftrast.numthreads; i++)
825 thread = &dpsoftrast.threads[i];
// distance from this thread's position to the write position, wrapped to the pool size
826 commandoffset = freecommand - thread->commandoffset;
827 if (commandoffset < 0)
828 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
829 if (commandoffset > usedcommands)
832 usedcommands = commandoffset;
835 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait for the chosen thread; a starving thread is signalled before waiting
837 thread = &dpsoftrast.threads[waitindex];
838 Thread_LockMutex(thread->drawmutex);
839 if (thread->commandoffset != dpsoftrast.drawcommand)
841 thread->waiting = true;
842 if (thread->starving) Thread_CondSignal(thread->drawcond);
843 Thread_CondWait(thread->waitcond, thread->drawmutex);
844 thread->waiting = false;
846 Thread_UnlockMutex(thread->drawmutex);
848 dpsoftrast.commandpool.usedcommands = usedcommands;
// Round a byte count up to the next multiple of COMMAND_SIZE.  The masking
// requires COMMAND_SIZE to be a power of two.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE - 1)) & ~(COMMAND_SIZE - 1))

// Allocate a command of type DPSOFTRAST_Command_<name> from the command pool,
// with its size rounded up by DPSOFTRAST_ALIGNCOMMAND.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserve `size` bytes in the command pool for a command with the given opcode and
// fill in its header.  When the command does not fit before the end of the pool, a
// Reset command is written at the current position and the skipped tail is counted
// as used.
// NOTE(review): several lines (position updates, wrap-around, the return
// statement) are not visible in this excerpt.
856 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
858 DPSOFTRAST_Command *command;
859 int freecommand = dpsoftrast.commandpool.freecommand;
860 int usedcommands = dpsoftrast.commandpool.usedcommands;
// room needed beyond the command itself: one command header, plus the unusable
// tail of the pool when the command does not fit before the end
861 int extra = sizeof(DPSOFTRAST_Command);
862 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
863 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: make room first
864 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
866 if (dpsoftrast.usethreads)
867 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
869 DPSOFTRAST_Draw_FlushThreads();
870 freecommand = dpsoftrast.commandpool.freecommand;
871 usedcommands = dpsoftrast.commandpool.usedcommands;
873 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
875 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
876 command->opcode = DPSOFTRAST_OPCODE_Reset;
877 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
880 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
881 command->opcode = opcode;
882 command->commandsize = size;
884 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
886 dpsoftrast.commandpool.freecommand = freecommand;
887 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Give back the `size` bytes of the most recently allocated command.
// NOTE(review): the lines that move freecommand back and guard the wrap-around
// adjustment are not visible in this excerpt.
891 static void DPSOFTRAST_UndoCommand(int size)
893 int freecommand = dpsoftrast.commandpool.freecommand;
894 int usedcommands = dpsoftrast.commandpool.usedcommands;
897 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
898 usedcommands -= size;
899 dpsoftrast.commandpool.freecommand = freecommand;
900 dpsoftrast.commandpool.usedcommands = usedcommands;
903 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
904 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
906 thread->viewport[0] = command->x;
907 thread->viewport[1] = command->y;
908 thread->viewport[2] = command->width;
909 thread->viewport[3] = command->height;
910 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queue a Viewport command and update the global viewport and its derived transform.
// NOTE(review): two lines after the allocation (presumably the x / y field
// assignments) are not visible in this excerpt.
912 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
914 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
917 command->width = width;
918 command->height = height;
// keep the global copy of the viewport in step with what was queued
920 dpsoftrast.viewport[0] = x;
921 dpsoftrast.viewport[1] = y;
922 dpsoftrast.viewport[2] = width;
923 dpsoftrast.viewport[3] = height;
924 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
927 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Apply a queued ClearColor command: fill the part of the scissor rectangle that
// lies in this thread's row ranges with the packed color, for every color buffer
// that is present.
// NOTE(review): several lines (declarations of c, p and bandy, early-outs, the
// inner loop bodies) are not visible in this excerpt.
928 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
930 int i, x1, y1, x2, y2, w, h, x, y;
931 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row ranges are current
935 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
936 miny1 = thread->miny1;
937 maxy1 = thread->maxy1;
938 miny2 = thread->miny2;
939 maxy2 = thread->maxy2;
940 x1 = thread->fb_scissor[0];
941 y1 = thread->fb_scissor[1];
942 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
943 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to those owned by this thread
944 if (y1 < miny1) y1 = miny1;
945 if (y2 > maxy2) y2 = maxy2;
950 // FIXME: honor fb_colormask?
951 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
952 for (i = 0;i < 4;i++)
954 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1, second pass continues from miny2 up to maxy2
956 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
959 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
960 for (x = x1;x < x2;x++)
// Queue a ClearColor command.
// NOTE(review): the lines that store r, g, b and a into the command are not
// visible in this excerpt.
965 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
967 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
974 DEFCOMMAND(3, ClearDepth, float depth;)
// Apply a queued ClearDepth command: fill the part of the scissor rectangle that
// lies in this thread's row ranges with the converted depth value.
// NOTE(review): several lines (declarations of c, p and bandy, early-outs, the
// inner loop bodies) are not visible in this excerpt.
975 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
977 int x1, y1, x2, y2, w, h, x, y;
978 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row ranges are current
982 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
983 miny1 = thread->miny1;
984 maxy1 = thread->maxy1;
985 miny2 = thread->miny2;
986 maxy2 = thread->maxy2;
987 x1 = thread->fb_scissor[0];
988 y1 = thread->fb_scissor[1];
989 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
990 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to those owned by this thread
991 if (y1 < miny1) y1 = miny1;
992 if (y2 > maxy2) y2 = maxy2;
997 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1, second pass continues from miny2 up to maxy2
998 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1001 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1002 for (x = x1;x < x2;x++)
// Queue a ClearDepth command.
// NOTE(review): the line that stores d into the command is not visible in this excerpt.
1006 void DPSOFTRAST_ClearDepth(float d)
1008 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1012 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1013 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1015 thread->colormask[0] = command->r != 0;
1016 thread->colormask[1] = command->g != 0;
1017 thread->colormask[2] = command->b != 0;
1018 thread->colormask[3] = command->a != 0;
1019 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queue a ColorMask command.
// NOTE(review): the lines that store r, g, b and a into the command are not
// visible in this excerpt.
1021 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1023 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1030 DEFCOMMAND(5, DepthTest, int enable;)
1031 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1033 thread->depthtest = command->enable;
1034 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1036 void DPSOFTRAST_DepthTest(int enable)
1038 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1039 command->enable = enable;
1042 DEFCOMMAND(6, ScissorTest, int enable;)
1043 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1045 thread->scissortest = command->enable;
1046 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1048 void DPSOFTRAST_ScissorTest(int enable)
1050 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1051 command->enable = enable;
1054 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1055 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1057 thread->scissor[0] = command->x;
1058 thread->scissor[1] = command->y;
1059 thread->scissor[2] = command->width;
1060 thread->scissor[3] = command->height;
1061 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queue a Scissor command.
// NOTE(review): two lines after the allocation (presumably the x / y field
// assignments) are not visible in this excerpt.
1063 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1065 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1068 command->width = width;
1069 command->height = height;
// BlendFunc command (DEFCOMMAND number 8); payload: int sfactor, dfactor.
1072 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Applies a BlendFunc command to one thread's state:
// thread->blendfunc[0] receives the source factor, [1] the destination factor.
1073 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1075 thread->blendfunc[0] = command->sfactor;
1076 thread->blendfunc[1] = command->dfactor;
// raise the BLENDFUNC bit in thread->validate (presumably consumed by a later validation pass - confirm)
1077 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public setter: obtains a BlendFunc command record and stores the source and
// destination blend factors in it.
1079 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1081 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1082 command->sfactor = sfactor;
1083 command->dfactor = dfactor;
// BlendSubtract command (DEFCOMMAND number 9); payload: int enable.
1086 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command to one thread's state.
1087 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1089 thread->blendsubtract = command->enable;
// shares the BLENDFUNC validation bit with DPSOFTRAST_Interpret_BlendFunc
1090 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public setter: obtains a BlendSubtract command record and stores the
// requested enable flag in it.
1092 void DPSOFTRAST_BlendSubtract(int enable)
1094 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1095 command->enable = enable;
// DepthMask command (DEFCOMMAND number 10); payload: int enable.
1098 DEFCOMMAND(10, DepthMask, int enable;)
// Applies a DepthMask command to one thread's state; no validation bit is
// raised in the visible code.
1099 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1101 thread->depthmask = command->enable;
// Public setter: obtains a DepthMask command record and stores the requested
// enable flag in it.
1103 void DPSOFTRAST_DepthMask(int enable)
1105 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1106 command->enable = enable;
// DepthFunc command (DEFCOMMAND number 11); payload: int func.
1109 DEFCOMMAND(11, DepthFunc, int func;)
// Applies a DepthFunc command to one thread's state.
// NOTE(review): unlike DepthTest, no DPSOFTRAST_VALIDATE_DEPTHFUNC bit is
// raised in the visible code - confirm this is intentional.
1110 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1112 thread->depthfunc = command->func;
// Public setter: obtains a DepthFunc command record and stores the requested
// comparison function in it.
1114 void DPSOFTRAST_DepthFunc(int func)
1116 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1117 command->func = func;
// DepthRange command (DEFCOMMAND number 12); payload: float nearval, farval.
1120 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command to one thread's state:
// thread->depthrange[0] receives the near value, [1] the far value.
1121 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1123 thread->depthrange[0] = command->nearval;
1124 thread->depthrange[1] = command->farval;
// Public setter: obtains a DepthRange command record and stores the near and
// far values in it.
1126 void DPSOFTRAST_DepthRange(float nearval, float farval)
1128 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1129 command->nearval = nearval;
1130 command->farval = farval;
// PolygonOffset command (DEFCOMMAND number 13); payload: float alongnormal, intoview.
1133 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command to one thread's state:
// thread->polygonoffset[0] receives alongnormal, [1] receives intoview.
1134 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1136 thread->polygonoffset[0] = command->alongnormal;
1137 thread->polygonoffset[1] = command->intoview;
// Public setter: obtains a PolygonOffset command record and stores both offset
// terms in it.
1139 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1141 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1142 command->alongnormal = alongnormal;
1143 command->intoview = intoview;
// CullFace command (DEFCOMMAND number 14); payload: int mode.
1146 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command to one thread's state.
1147 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1149 thread->cullface = command->mode;
// Public setter: obtains a CullFace command record and stores the requested
// culling mode in it.
1151 void DPSOFTRAST_CullFace(int mode)
1153 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1154 command->mode = mode;
// AlphaTest command (DEFCOMMAND number 15); payload: int enable.
1157 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command to one thread's state.
1158 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1160 thread->alphatest = command->enable;
// Public setter: obtains an AlphaTest command record and stores the requested
// enable flag in it.
1162 void DPSOFTRAST_AlphaTest(int enable)
1164 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1165 command->enable = enable;
// AlphaFunc command (DEFCOMMAND number 16); payload: int func, float ref.
1168 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command to one thread's state: the comparison function
// goes to thread->alphafunc and the reference value to thread->alphavalue.
1169 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1171 thread->alphafunc = command->func;
1172 thread->alphavalue = command->ref;
1174 void DPSOFTRAST_AlphaFunc(int func, float ref)
1176 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1177 command->func = func;
// Sets the current constant color. Unlike the command-based setters this
// writes straight into the global dpsoftrast.color[] (r, g, b, a order);
// DPSOFTRAST_Array_Load replicates that value when no color array is set.
1181 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1183 dpsoftrast.color[0] = r;
1184 dpsoftrast.color[1] = g;
1185 dpsoftrast.color[2] = b;
1186 dpsoftrast.color[3] = a;
// Reads a rectangle of the first color buffer back into outpixels.
// blockx, blocky, blockwidth, blockheight: requested rectangle, clamped to the
// framebuffer extents before any pixel is touched.
// outpixels: destination, addressed with a row pitch of blockwidth*4 bytes.
// Source rows are addressed as (fb_height - 1 - y), so the framebuffer is read
// in the opposite vertical order from the output rows.
1189 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1191 int outstride = blockwidth * 4;
1192 int instride = dpsoftrast.fb_width * 4;
1195 int bx2 = blockx + blockwidth;
1196 int by2 = blocky + blockheight;
1200 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer
1204 if (bx1 < 0) bx1 = 0;
1205 if (by1 < 0) by1 = 0;
1206 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1207 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1209 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk each row pixel by pixel
// NOTE(review): the per-pixel body is not visible here - presumably a byte reorder; confirm
1210 if (dpsoftrast.bigendian)
1212 for (y = by1;y < by2;y++)
// NOTE(review): output rows are packed from the start of outpixels using the
// clamped by1 and no x offset, so a rectangle clamped on its low edges lands
// shifted in the output - confirm callers expect this
1214 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1215 o = (unsigned char *)outpixels + (y - by1) * outstride;
1216 for (x = bx1;x < bx2;x++)
// NOTE(review): second row loop, presumably the non-big-endian path - confirm
1229 for (y = by1;y < by2;y++)
1231 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1232 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the first color buffer into one mip level of a texture.
// index: texture handle; mip: mip level to write.
// tx, ty: destination origin in the texture; sx, sy: source origin in the
// framebuffer; width, height: rectangle size.
// Returns without copying when the handle does not resolve to a texture or mip
// is outside [0, texture->mipmaps).
1238 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1242 int tx2 = tx + width;
1243 int ty2 = ty + height;
1246 int sx2 = sx + width;
1247 int sy2 = sy + height;
1257 unsigned int *spixels;
1258 unsigned int *tpixels;
1259 DPSOFTRAST_Texture *texture;
1260 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1261 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the framebuffer, destination is the selected mip level;
// mipmap[mip][0] is used as a byte offset and [2]/[3] as width/height
1263 spixels = dpsoftrast.fb_colorpixels[0];
1264 swidth = dpsoftrast.fb_width;
1265 sheight = dpsoftrast.fb_height;
1266 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1267 twidth = texture->mipmap[mip][2];
1268 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1269 if (tx1 < 0) tx1 = 0;
1270 if (ty1 < 0) ty1 = 0;
1271 if (tx2 > twidth) tx2 = twidth;
1272 if (ty2 > theight) ty2 = theight;
1273 if (sx1 < 0) sx1 = 0;
1274 if (sy1 < 0) sy1 = 0;
1275 if (sx2 > swidth) sx2 = swidth;
1276 if (sy2 > sheight) sy2 = sheight;
// the copy size is the smaller of the two clamped rectangles
1281 if (tw > sw) tw = sw;
1282 if (th > sh) th = sh;
// NOTE(review): the statement guarded by this test is not visible here -
// presumably an early return for an empty rectangle; confirm
1283 if (tw < 1 || th < 1)
// convert sy1 to a row index counted from the other end of the framebuffer;
// the loop below then walks source rows downwards (sy1 - y) while the texture
// rows go upwards (ty1 + y)
1285 sy1 = sheight - 1 - sy1;
1286 for (y = 0;y < th;y++)
1287 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// rebuild the mip chain when the texture has more than one level
1288 if (texture->mipmaps > 1)
1289 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command (DEFCOMMAND number 17); payload: int unitnum, DPSOFTRAST_Texture *texture.
1292 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state: releases this thread's
// bind on whatever texture the unit previously held, then records the new one.
1293 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
// an empty unit holds NULL, so only decrement when something was bound
1295 if (thread->texbound[command->unitnum])
1296 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1297 thread->texbound[command->unitnum] = command->texture;
1299 void DPSOFTRAST_SetTexture(int unitnum, int index)
1301 DPSOFTRAST_Command_SetTexture *command;
1302 DPSOFTRAST_Texture *texture;
1303 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1305 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1308 texture = DPSOFTRAST_Texture_GetByIndex(index);
1309 if (index && !texture)
1311 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1315 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1316 command->unitnum = unitnum;
1317 command->texture = texture;
1319 dpsoftrast.texbound[unitnum] = texture;
1320 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Registers the caller's position array. Only the pointer and stride are
// stored in the global state; no data is copied here.
// vertex3f: first position; stride: byte distance between consecutive
// positions (DPSOFTRAST_Array_Load applies it to an unsigned char pointer).
1323 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1325 dpsoftrast.pointer_vertex3f = vertex3f;
1326 dpsoftrast.stride_vertex = stride;
// Registers a float color array and clears the byte color pointer, so at most
// one of the two color sources is non-NULL at a time.
// color4f: first color; stride: byte distance between consecutive colors.
1328 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1330 dpsoftrast.pointer_color4f = color4f;
1331 dpsoftrast.pointer_color4ub = NULL;
1332 dpsoftrast.stride_color = stride;
// Registers an unsigned-byte color array and clears the float color pointer,
// so at most one of the two color sources is non-NULL at a time.
// color4ub: first color; stride: byte distance between consecutive colors.
1334 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1336 dpsoftrast.pointer_color4f = NULL;
1337 dpsoftrast.pointer_color4ub = color4ub;
1338 dpsoftrast.stride_color = stride;
// Registers a texture coordinate array for one unit.
// unitnum: slot in the texcoord tables; numcomponents: floats per coordinate
// (DPSOFTRAST_Array_Load switches on this value); stride: byte distance
// between consecutive coordinates; texcoordf: first coordinate.
// NOTE(review): unitnum indexes three global arrays without a range check -
// confirm callers guarantee a valid unit.
1340 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1342 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1343 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1344 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command (DEFCOMMAND number 18); payload: int mode, permutation, exactspecularmath.
1347 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command to one thread's state by copying all three
// fields into the matching thread->shader_* members.
1348 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1350 thread->shader_mode = command->mode;
1351 thread->shader_permutation = command->permutation;
1352 thread->shader_exactspecularmath = command->exactspecularmath;
// Public setter for the active shader: records a SetShader command and also
// mirrors the same three values into the global dpsoftrast state.
1354 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1356 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1357 command->mode = mode;
1358 command->permutation = permutation;
1359 command->exactspecularmath = exactspecularmath;
// global copy, kept in step with what was just recorded
1361 dpsoftrast.shader_mode = mode;
1362 dpsoftrast.shader_permutation = permutation;
1363 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command (DEFCOMMAND number 19); payload: uniform index and four floats.
1366 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command to one thread's state: uniforms are stored four
// floats apart, so slot `index` starts at uniform4f[index*4].
1367 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1369 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a four-component uniform from individual values: records a Uniform4f
// command and mirrors the values into the global uniform table.
// index: uniform slot; v0..v3: the four components in storage order.
1371 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1373 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1374 command->index = index;
1375 command->val[0] = v0;
1376 command->val[1] = v1;
1377 command->val[2] = v2;
1378 command->val[3] = v3;
// global copy, same index*4 layout the interpreter uses
1380 dpsoftrast.uniform4f[index*4+0] = v0;
1381 dpsoftrast.uniform4f[index*4+1] = v1;
1382 dpsoftrast.uniform4f[index*4+2] = v2;
1383 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: records a Uniform4f command from four
// floats at v and mirrors them into the global uniform table.
// index: uniform slot; v: must point at four readable floats.
1385 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1387 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1388 command->index = index;
1389 memcpy(command->val, v, sizeof(command->val));
1391 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command (DEFCOMMAND number 20); payload: uniform index and
// sixteen floats declared through ALIGN().
1394 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command to one thread's state: sixteen floats are
// copied starting at uniform4f[index*4], i.e. the matrix fills four
// consecutive four-float uniform slots.
1395 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1397 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets one or more 4x4 matrix uniforms.
// uniform: first uniform slot; arraysize: number of matrices; v: arraysize*16
// floats; transpose: requests a transposed store.
// One UniformMatrix4f command is recorded per matrix and the result is also
// written to the global uniform table.
1399 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix consumes four uniform slots (index += 4) and sixteen input floats
1403 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1405 __m128 m0, m1, m2, m3;
1406 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1407 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned loads when v is not a multiple of ALIGN_SIZE
1408 if (((size_t)v)&(ALIGN_SIZE-1))
1410 m0 = _mm_loadu_ps(v);
1411 m1 = _mm_loadu_ps(v+4);
1412 m2 = _mm_loadu_ps(v+8);
1413 m3 = _mm_loadu_ps(v+12);
// aligned source: use the aligned load form
1417 m0 = _mm_load_ps(v);
1418 m1 = _mm_load_ps(v+4);
1419 m2 = _mm_load_ps(v+8);
1420 m3 = _mm_load_ps(v+12);
// 4x4 transpose via unpack/move pairs: after this m0..m3 hold what were the
// columns of the loaded rows
// NOTE(review): the guard on this section is not visible here - presumably `transpose`; confirm
1424 __m128 t0, t1, t2, t3;
1425 t0 = _mm_unpacklo_ps(m0, m1);
1426 t1 = _mm_unpacklo_ps(m2, m3);
1427 t2 = _mm_unpackhi_ps(m0, m1);
1428 t3 = _mm_unpackhi_ps(m2, m3);
1429 m0 = _mm_movelh_ps(t0, t1);
1430 m1 = _mm_movehl_ps(t1, t0);
1431 m2 = _mm_movelh_ps(t2, t3);
1432 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: command->val is declared through ALIGN(); the global table
// must be suitably aligned at index*4 as well
1434 _mm_store_ps(command->val, m0);
1435 _mm_store_ps(command->val+4, m1);
1436 _mm_store_ps(command->val+8, m2);
1437 _mm_store_ps(command->val+12, m3);
1438 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1439 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1440 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1441 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command (DEFCOMMAND number 21); payload: uniform index and one int.
1446 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command to one thread's state; integer uniforms are
// indexed directly (no *4 stride, unlike the float table).
1447 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1449 thread->uniform1i[command->index] = command->val;
1451 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1453 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1454 command->index = index;
1457 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command (DEFCOMMAND number 24); payload: four floats.
1460 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Applies a ClipPlane command to one thread's state by copying the four plane
// coefficients into thread->clipplane.
1461 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1463 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
// raise the FB bit in thread->validate (presumably consumed by a later validation pass - confirm)
1464 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public setter: obtains a ClipPlane command record and stores the plane
// coefficients in x, y, z, w order.
1466 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1468 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1469 command->clipplane[0] = x;
1470 command->clipplane[1] = y;
1471 command->clipplane[2] = z;
1472 command->clipplane[3] = w;
// Loads four-float elements from a byte-addressed source into dst.
// dst: destination, written with aligned stores (_mm_store_ps); size: element
// count (end = dst + size*4); src/stride: source base and element pitch.
// NOTE(review): the loop control and pointer advances are not visible here -
// presumably dst += 4 and src += stride until dst reaches end; confirm.
1476 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1478 float *end = dst + size*4;
1479 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
// src or stride not a multiple of ALIGN_SIZE: unaligned load
1483 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// both aligned: every element address stays aligned, so use the aligned load
1492 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands three-float elements to four floats, setting the fourth to 1.0f.
// dst: destination, written with aligned stores; size: element count;
// src/stride: source base and element pitch.
// When the source is tightly packed (stride == 12 bytes) four elements are
// converted per step from three 16-byte loads; any other stride is handled
// one element at a time by the single-element sequences at the end.
// The recurring rotate / _mm_move_ss / rotate-back triple replaces lane 3 with
// 1.0f: the first shuffle moves lane 3 into lane 0, _mm_move_ss overwrites
// lane 0, and the last shuffle moves it back to lane 3.
// NOTE(review): loop headers and dst advances are not visible here; confirm
// the tail after end4 falls through to the single-element path.
1499 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1501 float *end = dst + size*4;
1502 if (stride == sizeof(float[3]))
// end of the part that can be processed four elements at a time
1504 float *end4 = dst + (size&~3)*4;
1505 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed path: v1,v2,v3 hold source floats 0-3, 4-7, 8-11
1509 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// element 0: floats 0,1,2 then 1.0
1510 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1511 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1512 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 1: floats 3,4,5 then 1.0
1513 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1514 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 2: floats 6,7,8 then 1.0
1516 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1517 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1518 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1519 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 3: floats 9,10,11 then 1.0 (float 8 in lane 0 is the one overwritten)
1520 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1521 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1523 src += 4*sizeof(float[3]);
// aligned packed path: same shuffles with aligned loads
1530 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1531 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1532 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1533 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1535 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1538 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1539 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1540 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1541 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1542 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1544 src += 4*sizeof(float[3]);
// single-element path; note each load reads 16 bytes although only 12 are used
1548 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1552 __m128 v = _mm_loadu_ps((const float *)src);
1553 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556 _mm_store_ps(dst, v);
// aligned variant of the single-element path
1565 __m128 v = _mm_load_ps((const float *)src);
1566 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1567 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1568 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1569 _mm_store_ps(dst, v);
// Expands two-float elements to four floats as (a, b, 0.0, 1.0).
// dst: destination, written with aligned stores; size: element count;
// src/stride: source base and element pitch.
// Tightly packed sources (stride == 8 bytes) are converted two elements per
// 16-byte load; any other stride is handled one element at a time.
// NOTE(review): loop headers and dst advances are not visible here - confirm.
1576 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1578 float *end = dst + size*4;
// constant upper half: lanes 2 and 3 of every output come from here
1579 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1580 if (stride == sizeof(float[2]))
// end of the part that can be processed two elements at a time
1582 float *end2 = dst + (size&~1)*4;
1583 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1587 __m128 v = _mm_loadu_ps((const float *)src);
// first element: low pair of v + (0, 1); second: high pair of v + (0, 1)
1588 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1589 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1591 src += 2*sizeof(float[2]);
// aligned variant of the packed path
1598 __m128 v = _mm_load_ps((const float *)src);
1599 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1600 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1602 src += 2*sizeof(float[2]);
// single-element path: load 8 bytes into the low half, keep (0, 1) in the high half
1608 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts four-byte elements to four floats, scaling each byte by 1/255.
// dst: destination, written with aligned stores; size: element count;
// src/stride: source base and element pitch.
// Tightly packed sources (stride == 4 bytes) are converted four elements per
// 16-byte load; any other stride is handled one element at a time.
// Bytes are widened by interleaving with zero twice (8 -> 16 -> 32 bits), which
// zero-extends them, before the int-to-float conversion.
// NOTE(review): loop headers and dst advances are not visible here - confirm.
1614 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1616 float *end = dst + size*4;
1617 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1618 if (stride == sizeof(unsigned char[4]))
// end of the part that can be processed four elements at a time
1620 float *end4 = dst + (size&~3)*4;
1621 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 holds elements 0-1 as 16-bit lanes, v2 holds elements 2-3
1625 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1631 src += 4*sizeof(unsigned char[4]);
// aligned variant of the packed path
1638 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1639 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1640 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1641 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1642 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1644 src += 4*sizeof(unsigned char[4]);
// single-element path: the four bytes are read as one int
// NOTE(review): this dereferences src as int regardless of its alignment - confirm acceptable on all targets
1650 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1651 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes one four-float value to dst.
// dst: destination, written with an aligned store; src: four floats, read with
// an unaligned load; size: element count (end = dst + 4*size).
// NOTE(review): the loop around the store is not visible here - presumably the
// store repeats with dst += 4 until dst reaches end; confirm.
1657 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1659 float *end = dst + 4*size;
1660 __m128 v = _mm_loadu_ps(src);
1663 _mm_store_ps(dst, v);
// Multiplies numitems four-float vectors by a 4x4 matrix.
// out4f: destination; in4f: source (may equal out4f); inmatrix16f: sixteen
// floats, loaded as four rows m0..m3 with unaligned loads.
// Each result is v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3.
// NOTE(review): the store of each result, the loop control and the return
// after the identity fast path are not visible here - confirm.
1669 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1672 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1673 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1675 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1677 // fast case for identity matrix
// skip the copy entirely for an in-place call
1678 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1681 end = out4f + numitems*4;
1682 m0 = _mm_loadu_ps(inmatrix16f);
1683 m1 = _mm_loadu_ps(inmatrix16f + 4);
1684 m2 = _mm_loadu_ps(inmatrix16f + 8);
1685 m3 = _mm_loadu_ps(inmatrix16f + 12);
1686 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned source path
1690 __m128 v = _mm_loadu_ps(in4f);
1692 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1693 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1694 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1695 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned source path, same arithmetic
1704 __m128 v = _mm_load_ps(in4f);
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1707 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1708 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1709 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems four-float vertices from in4f to out4f.
// out4f: destination; in4f: source; numitems: number of four-float elements.
// An in-place call (out4f == in4f) or a non-positive count is a no-op, and
// memmove is used so that partially overlapping buffers are copied correctly;
// memcpy has undefined behavior when source and destination overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (out4f == in4f || numitems <= 0)
		return;
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// Perspective-divides a vertex and maps it through the viewport.
// in: vertex as an __m128; out: receives the result.
// The vertex is rotated so its lane 3 sits in lane 0, lane 0 is replaced by
// 1.0f, and the whole vector is scaled, divided by the original lane 3 and
// offset; the result is rotated back. Lanes 0-2 of out therefore hold
// center + scale * component / w, and lane 3 holds center + scale / w, using
// the viewport lanes in the rotated order (viewport lane 0 pairs with w).
1723 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1725 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1726 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1727 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1728 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same visible body as DPSOFTRAST_PROJECTVERTEX: rotate, force 1.0f into the
// w slot, scale, divide by w, offset, rotate back.
// NOTE(review): no use of this macro is visible in this part of the file and
// its body duplicates DPSOFTRAST_PROJECTVERTEX - confirm it is still needed.
1731 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1733 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1734 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1735 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1736 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a vertex by a matrix given as four rows:
// out = p[0]*m0 + p[1]*m1 + p[2]*m2 + p[3]*m3.
// NOTE(review): the declaration of p is not visible here - presumably a copy
// of `in`, which is what lets callers pass the same variable as out and in; confirm.
1739 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1742 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1743 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1744 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1745 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space row range for a bounding box.
// starty, endy: receive the truncated row bounds.
// minposf, maxposf: box corners, read with aligned loads (must be 16-byte aligned).
// inmatrix16f: matrix applied to the eight box corners, read with unaligned loads.
// clipmask starts with all eight corner bits set; a bit is cleared for each
// corner whose clip distance (lane 2 + lane 3 of the transformed corner) is
// not negative.
// NOTE(review): the return statement is not visible here - presumably
// clipmask is returned; confirm.
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1750 int clipmask = 0xFF;
1751 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the projected range starts inverted (min 2, max -2) so any corner narrows it
1752 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix row so the transformed y lands in lane 0,
// where the scalar (_ss) min/max/div operations below work
1755 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its clip distance, and when it
// is not negative clear its clipmask bit and fold lane0/lane3 into minproj/maxproj
1759 #define BBFRONT(k, pos) \
1761 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1766 clipmask &= ~(1<<k); \
1767 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
1773 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1774 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1775 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1776 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1777 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1778 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k): for a clipped corner k, look at its three edge neighbours
// (k^1, k^2, k^4); for each neighbour that is not clipped, interpolate along
// the edge to where the clip distance reaches zero and fold that point's
// projection into minproj/maxproj
1782 if (clipmask&(1<<k)) \
1784 if (!(clipmask&(1<<(k^1)))) \
1786 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789 minproj = _mm_min_ss(minproj, proj); \
1790 maxproj = _mm_max_ss(maxproj, proj); \
1792 if (!(clipmask&(1<<(k^2)))) \
1794 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797 minproj = _mm_min_ss(minproj, proj); \
1798 maxproj = _mm_max_ss(maxproj, proj); \
1800 if (!(clipmask&(1<<(k^4)))) \
1802 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805 minproj = _mm_min_ss(minproj, proj); \
1806 maxproj = _mm_max_ss(maxproj, proj); \
1810 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring viewport lane 2 into lane 0 for the scalar mapping below
1811 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to screen rows
1813 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// start comes from maxproj and end from minproj (+1 after truncation)
// NOTE(review): presumably the scale in this lane is negative, flipping the order - confirm
1817 *starty = _mm_cvttss_si32(maxproj);
1818 *endy = _mm_cvttss_si32(minproj)+1;
// Projects already-transformed vertices to screen space.
// out4f: receives a copy of each input vertex (aligned stores);
// screen4f: receives each projected vertex (aligned stores);
// starty, endy: receive the row range from DPSOFTRAST_Vertex_BoundY;
// in4f: source vertices; numitems: vertex count.
// Returns whatever DPSOFTRAST_Vertex_BoundY returns for the component-wise
// min/max box of the inputs with an identity matrix.
// NOTE(review): the loop control and pointer advances are not visible here;
// in4f is read once before the loop, so numitems == 0 still reads one vertex - confirm.
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1824 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825 float *end = out4f + numitems*4;
1826 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827 __m128 minpos, maxpos;
1828 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned source path: seed the bounds with the first vertex
1830 minpos = maxpos = _mm_loadu_ps(in4f);
1833 __m128 v = _mm_loadu_ps(in4f);
1834 minpos = _mm_min_ps(minpos, v);
1835 maxpos = _mm_max_ps(maxpos, v);
1836 _mm_store_ps(out4f, v);
1837 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838 _mm_store_ps(screen4f, v);
// aligned source path, same work with aligned loads
1846 minpos = maxpos = _mm_load_ps(in4f);
1849 __m128 v = _mm_load_ps(in4f);
1850 minpos = _mm_min_ps(minpos, v);
1851 maxpos = _mm_max_ps(maxpos, v);
1852 _mm_store_ps(out4f, v);
1853 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854 _mm_store_ps(screen4f, v);
// spill the bounds to ALIGN()ed arrays because BoundY reads them with aligned loads
1862 ALIGN(float minposf[4]);
1863 ALIGN(float maxposf[4]);
1864 _mm_store_ps(minposf, minpos);
1865 _mm_store_ps(maxposf, maxpos);
1866 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms vertices by a matrix and projects them to screen space.
// out4f: receives each transformed vertex (aligned stores);
// screen4f: receives each projected vertex (aligned stores);
// starty, endy: receive the row range from DPSOFTRAST_Vertex_BoundY;
// in4f: source vertices; numitems: vertex count; inmatrix16f: sixteen floats.
// The bounds are accumulated from the untransformed inputs; BoundY applies
// inmatrix16f to the box corners itself.
// An identity matrix is forwarded to DPSOFTRAST_Vertex_Project.
// NOTE(review): the loop control and pointer advances are not visible here - confirm.
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1873 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// bytewise identity test; the projection-only routine skips the multiply
1876 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878 end = out4f + numitems*4;
1879 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881 m0 = _mm_loadu_ps(inmatrix16f);
1882 m1 = _mm_loadu_ps(inmatrix16f + 4);
1883 m2 = _mm_loadu_ps(inmatrix16f + 8);
1884 m3 = _mm_loadu_ps(inmatrix16f + 12);
1885 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned source path: seed the bounds with the first vertex
1887 minpos = maxpos = _mm_loadu_ps(in4f);
1890 __m128 v = _mm_loadu_ps(in4f);
1891 minpos = _mm_min_ps(minpos, v);
1892 maxpos = _mm_max_ps(maxpos, v);
1893 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894 _mm_store_ps(out4f, v);
1895 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896 _mm_store_ps(screen4f, v);
// aligned source path, same work with aligned loads
1904 minpos = maxpos = _mm_load_ps(in4f);
1907 __m128 v = _mm_load_ps(in4f);
1908 minpos = _mm_min_ps(minpos, v);
1909 maxpos = _mm_max_ps(maxpos, v);
1910 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911 _mm_store_ps(out4f, v);
1912 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913 _mm_store_ps(screen4f, v);
// spill the bounds to ALIGN()ed arrays because BoundY reads them with aligned loads
1921 ALIGN(float minposf[4]);
1922 ALIGN(float maxposf[4]);
1923 _mm_store_ps(minposf, minpos);
1924 _mm_store_ps(maxposf, maxpos);
1925 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Converts one caller-supplied vertex attribute array into the internal
// four-float layout.
// outarray: index of the destination in dpsoftrast.post_array4f[];
// inarray: which attribute to read (position, color, or a texcoord unit).
// The range converted is [firstvertex, firstvertex + numvertices) taken from
// the global state; strides are byte strides applied to an unsigned char base.
// NOTE(review): the switch header, break statements, default handling and the
// return are not visible here - presumably outf is returned; confirm.
1931 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1934 float *outf = dpsoftrast.post_array4f[outarray];
1935 const unsigned char *inb;
1936 int firstvertex = dpsoftrast.firstvertex;
1937 int numvertices = dpsoftrast.numvertices;
// positions: three floats per vertex, expanded to four
1941 case DPSOFTRAST_ARRAY_POSITION:
1942 stride = dpsoftrast.stride_vertex;
1943 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1944 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array first, then byte array, otherwise the constant color
1946 case DPSOFTRAST_ARRAY_COLOR:
1947 stride = dpsoftrast.stride_color;
1948 if (dpsoftrast.pointer_color4f)
1950 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1951 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1953 else if (dpsoftrast.pointer_color4ub)
// stride already holds stride_color from above; this reassignment is redundant
1955 stride = dpsoftrast.stride_color;
1956 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1957 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array bound: replicate the value set by DPSOFTRAST_Color4f
1961 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinates: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and
// the loader is chosen by the registered component count
1965 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1966 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1969 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1972 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1975 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1978 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Loads an attribute array (unless inarray is negative, in which case the
// existing contents of post_array4f[outarray] are used) and multiplies it in
// place by inmatrix16f.
// NOTE(review): the return is not visible here - presumably data; confirm.
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1992 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Loads an attribute array (unless inarray is negative, in which case the
// existing contents of post_array4f[outarray] are used) and projects it:
// screen coordinates go to dpsoftrast.screencoord4f, the row range to
// drawstarty/drawendy, and the projection routine's result to drawclipped.
// NOTE(review): the return is not visible here - presumably data; confirm.
1998 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
2001 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2002 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Loads an attribute array (unless inarray is negative, in which case the
// existing contents of post_array4f[outarray] are used), transforms it in
// place by inmatrix16f and projects it: screen coordinates go to
// dpsoftrast.screencoord4f, the row range to drawstarty/drawendy, and the
// routine's result to drawclipped.
// NOTE(review): the return is not visible here - presumably data; confirm.
2010 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2013 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2014 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for one span.
// thread: calling thread's state; triangle: supplies the w plane as
// w[0] (slope per x), w[1] (slope per y), w[2] (constant);
// span: supplies the span origin (x, y) and the pixel range [startx, endx);
// zf: output buffer.
// NOTE(review): the stores into zf and the declarations of x, z and dz are not
// visible here - presumably zf[x] receives the value for each pixel; confirm.
2021 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2024 int startx = span->startx;
2025 int endx = span->endx;
2026 float wslope = triangle->w[0];
// w at the span origin; per-pixel w is w + wslope * x
2027 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028 float endz = 1.0f / (w + wslope * startx);
// zero x slope: w is constant along the span, so one reciprocal serves every pixel
2029 if (triangle->w[0] == 0)
2031 // LordHavoc: fast flat polygons (HUD/menu)
2032 for (x = startx;x < endx;x++)
// general case: compute an exact reciprocal only at sub-span boundaries
// (DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and step linearly in between
2036 for (x = startx;x < endx;)
2038 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: stop at the final pixel of the span
2040 if (nextsub >= endx) nextsub = endsub = endx-1;
2041 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step; zero when the sub-span is a single pixel (avoids dividing by 0)
2042 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043 for (; x <= endsub; x++, z += dz)
// DPSOFTRAST_Draw_Span_FinishBGRA8: writes a span of shaded pixels into the
// color buffer dpsoftrast.fb_colorpixels[0], applying the alpha test, the
// span's pixel mask and the thread's blend mode.
//   thread   - supplies alphatest and fb_blendmode
//   triangle - not referenced by any statement shown in this listing
//   span     - supplies x, y, startx, endx and pixelmask
//   in4ub    - source pixels, 4 bytes each, indexed by x; byte 3 is the value
//              used by the alpha test and by the alpha-weighted blend modes
// NOTE(review): many short lines (braces, labels, loop bodies) are missing
// from this listing, so the control flow between the visible statements
// should be confirmed against the complete file.
2048 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2052 int startx = span->startx;
2053 int endx = span->endx;
// the same source pixels viewed as one 32-bit word per pixel
2056 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
// one byte per pixel; the code below tests these bytes against 0 and 1
2057 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte and 32-bit views of the same destination buffer
2058 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views to pixel (span->x, span->y) so they can be indexed by x
2062 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063 pixeli += span->y * dpsoftrast.fb_width + span->x;
2064 // handle alphatest now (this affects depth writes too)
2065 if (thread->alphatest)
2066 for (x = startx;x < endx;x++)
2067 if (in4ub[x*4+3] < 128)
2068 pixelmask[x] = false;
2069 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070 // helps sprites, text and hud artwork
2071 switch(thread->fb_blendmode)
// these three modes weight the source by its alpha, so a source pixel whose
// alpha byte is 0 leaves the destination unchanged and can be masked off
2073 case DPSOFTRAST_BLENDMODE_ALPHA:
2074 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2077 for (x = startx;x < endx;x++)
2079 if (in4ub[x*4+3] >= 1)
// skip over a run of pixels with non-zero alpha
2084 while (++x < endx && in4ub[x*4+3] >= 1) ;
2086 if (x >= endx) break;
// clear the mask for a run of pixels with zero alpha
2088 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089 if (x >= endx) break;
2096 case DPSOFTRAST_BLENDMODE_OPAQUE:
2097 case DPSOFTRAST_BLENDMODE_ADD:
2098 case DPSOFTRAST_BLENDMODE_INVMOD:
2099 case DPSOFTRAST_BLENDMODE_MUL:
2100 case DPSOFTRAST_BLENDMODE_MUL2:
2101 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102 case DPSOFTRAST_BLENDMODE_INVADD:
2105 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): these two stores land at indices endx and endx+1, so the
// pixelmask buffer must have at least endx+2 entries - its allocation is not
// visible here, confirm
2106 pixelmask[endx] = 1;
2107 pixelmask[endx+1] = 0;
2108 // LordHavoc: use a double loop to identify subspans, this helps the
2109 // optimized copy/blend loops to perform at their best, most triangles
2110 // have only one run of pixels, and do the search using wide reads...
2114 // if this pixel is masked off, it's probably not alone...
2121 // the 4-item search must be aligned or else it stalls badly
2122 if ((x & 3) && !pixelmask[x])
2124 if(pixelmask[x]) goto endmasked;
2128 if(pixelmask[x]) goto endmasked;
2132 if(pixelmask[x]) goto endmasked;
// test four mask bytes at once: all zero means four masked-off pixels
2137 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// finish the search one byte at a time
2141 for (;!pixelmask[x];x++)
2143 // rather than continue the loop, just check the end variable
2148 // find length of subspan
2151 if (subx + 8 < endx)
2155 if(!pixelmask[subx]) goto endunmasked;
2159 if(!pixelmask[subx]) goto endunmasked;
2163 if(!pixelmask[subx]) goto endunmasked;
// test four mask bytes at once: 0x01010101 means four pixels that are all set
2168 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2172 for (;pixelmask[subx];subx++)
2174 // the checks can overshoot, so make sure to clip it...
2178 // now that we know the subspan length... process!
2179 switch(thread->fb_blendmode)
// opaque: the source pixels x..subx-1 replace the destination pixels; the
// visible statements copy them with memcpy or with 16-, 4- and 1-pixel
// unaligned SSE2/scalar copies (NOTE(review): the conditions selecting
// between these copies are not visible in this listing)
2181 case DPSOFTRAST_BLENDMODE_OPAQUE:
2185 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2190 while (x + 16 <= subx)
2192 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2200 while (x + 4 <= subx)
2202 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2208 pixeli[x+1] = ini[x+1];
// FINISHBLEND(blend2, blend1): walks the subspan two pixels at a time and then
// handles a single remaining pixel. In both forms src (input) and dst
// (framebuffer) hold the pixel bytes widened to 16-bit lanes, and the result
// in dst is packed back to bytes with unsigned saturation before being
// stored. blend2 is presumably the statement applied to the two-pixel form
// and blend1 the one applied to the single pixel - the lines that expand the
// two arguments are not visible in this listing.
// In every mode below "alpha" is lane 3 of each source pixel, broadcast to
// all four lanes of that pixel by the shuffles.
// ALPHA: dst = dst + (src - dst) * alpha / 256
2218 case DPSOFTRAST_BLENDMODE_ALPHA:
2219 #define FINISHBLEND(blend2, blend1) \
2220 for (;x + 1 < subx;x += 2) \
2223 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2226 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2231 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2234 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2238 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2241 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst = dst + ((src * alpha) >> 8)
2245 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2247 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2250 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst
2254 case DPSOFTRAST_BLENDMODE_ADD:
2255 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst = dst - ((dst * src) >> 8)
2257 case DPSOFTRAST_BLENDMODE_INVMOD:
2259 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2261 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2264 case DPSOFTRAST_BLENDMODE_MUL:
2265 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2267 case DPSOFTRAST_BLENDMODE_MUL2:
2268 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst = dst - ((src * alpha) >> 8)
2270 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2272 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2275 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2279 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2281 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2284 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst = dst + (255 - dst) * src / 256
2288 case DPSOFTRAST_BLENDMODE_INVADD:
2290 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2292 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2300 static void DPSOFTRAST_Texture2D(DPSOFTRAST_Texture *texture, int mip, float x, float y, float c[4])
2301 // warning: this is SLOW, only use if the optimized per-span functions won't do
2302 // FIXME does this function need flipping of the color order?
2304 const unsigned char * RESTRICT pixelbase;
2305 const unsigned char * RESTRICT pixel[4];
2307 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2308 tciwrapmask[1] = texture->mipmap[mip][3]-1;
2309 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2310 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2312 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2314 unsigned int tc[2] = { x * (texture->mipmap[mip][2]<<12) - 2048, y * (texture->mipmap[mip][3]<<12) - 2048};
2315 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2316 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2317 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2318 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2319 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2320 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
2321 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
2322 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[mip][2]-1 ? tci1[0] : texture->mipmap[mip][2]-1) : 0;
2323 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[mip][3]-1 ? tci1[1] : texture->mipmap[mip][3]-1) : 0;
2324 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2325 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
2326 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
2327 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
2328 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF00000);
2329 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF00000);
2330 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF00000);
2331 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF00000);
2335 unsigned int tc[2] = { x * (texture->mipmap[mip][2]<<12) - 2048, y * (texture->mipmap[mip][3]<<12) - 2048};
2336 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2337 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2338 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2339 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2340 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2341 tci[0] &= tciwrapmask[0];
2342 tci[1] &= tciwrapmask[1];
2343 tci1[0] &= tciwrapmask[0];
2344 tci1[1] &= tciwrapmask[1];
2345 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2346 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
2347 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
2348 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
2349 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF00000);
2350 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF00000);
2351 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF00000);
2352 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF00000);
2357 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2359 int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
2360 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
2361 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
2362 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2363 c[0] = pixel[0][0] * (1.0f / 255.0f);
2364 c[1] = pixel[0][1] * (1.0f / 255.0f);
2365 c[2] = pixel[0][2] * (1.0f / 255.0f);
2366 c[3] = pixel[0][3] * (1.0f / 255.0f);
2370 int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
2371 tci[0] &= tciwrapmask[0];
2372 tci[1] &= tciwrapmask[1];
2373 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2374 c[0] = pixel[0][0] * (1.0f / 255.0f);
2375 c[1] = pixel[0][1] * (1.0f / 255.0f);
2376 c[2] = pixel[0][2] * (1.0f / 255.0f);
2377 c[3] = pixel[0][3] * (1.0f / 255.0f);
// DPSOFTRAST_Texture2DBGRA8: reads one sample from a 2D texture as four bytes.
//   texture - texture to sample. For the chosen level, mipmap[mip][0] is used
//             as the byte offset of the level inside texture->bytes,
//             mipmap[mip][2] as its width (and row pitch) in texels and
//             mipmap[mip][3] as its height in texels
//   mip     - mipmap level to read
//   x, y    - texture coordinates; they are multiplied by the level's width
//             and height, so 0..1 spans the level once
//   c       - receives the four channel bytes
// NOTE(review): the statements that store into c[] on the nearest-filter
// paths are on short lines missing from this listing - confirm against the
// complete file.
2382 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2383 // warning: this is SLOW, only use if the optimized per-span functions won't do
2385 const unsigned char * RESTRICT pixelbase;
2386 const unsigned char * RESTRICT pixel[4];
// size-1 masks used for wrapping; this is only a true wrap when the level's
// dimensions are powers of two
2388 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2389 tciwrapmask[1] = texture->mipmap[mip][3]-1;
2390 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
// bilinear filtering: blend the four texels around the sample position
2391 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2393 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// texel position in 20.12 fixed point, moved back by half a texel (2048).
// NOTE(review): tc is unsigned, so a position below zero (any coordinate less
// than half a texel) is an out-of-range float-to-unsigned conversion, and
// because tc>>12 can never be negative the ">= 0" tests below never fail;
// such samples end up clamped to the far edge instead of texel 0. A signed tc
// looks like what was intended - confirm.
2395 unsigned int tc[2] = { x * (texture->mipmap[mip][2]<<12) - 2048, y * (texture->mipmap[mip][3]<<12) - 2048};
// 12-bit fraction and its complement on each axis
2396 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2397 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// bilinear weights; the four always sum to 0x1000000, hence the >>24 below
2398 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2399 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2400 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// clamp both texel columns and both texel rows to the level
2401 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
2402 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
2403 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[mip][2]-1 ? tci1[0] : texture->mipmap[mip][2]-1) : 0;
2404 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[mip][3]-1 ? tci1[1] : texture->mipmap[mip][3]-1) : 0;
// the four texels, 4 bytes each: (x0,y0), (x1,y0), (x0,y1), (x1,y1)
2405 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2406 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
2407 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
2408 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
2409 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2410 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2411 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2412 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// same bilinear blend, but texel indices are wrapped with the masks instead
// of being clamped
2416 unsigned int tc[2] = { x * (texture->mipmap[mip][2]<<12) - 2048, y * (texture->mipmap[mip][3]<<12) - 2048};
2417 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2418 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2419 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2420 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2421 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2422 tci[0] &= tciwrapmask[0];
2423 tci[1] &= tciwrapmask[1];
2424 tci1[0] &= tciwrapmask[0];
2425 tci1[1] &= tciwrapmask[1];
2426 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2427 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
2428 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
2429 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
2430 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2431 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2432 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2433 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// single-texel lookups: the coordinate is scaled to texels and truncated to
// an int, then clamped to the level
2438 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2440 int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
2441 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
2442 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
2443 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
// ... or wrapped with the masks
2451 int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
2452 tci[0] &= tciwrapmask[0];
2453 tci[1] &= tciwrapmask[1];
2454 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
// DPSOFTRAST_Draw_Span_Texture2DVarying: fills out4f (four floats per pixel,
// indexed by x) with samples of the texture bound to unit texunitindex for
// every pixel of the span.
//   thread       - supplies texbound[], the bound textures
//   triangle     - supplies mip[], the mipmap level chosen per texture unit
//   span         - supplies the pixel range startx..endx
//   out4f        - output colors scaled to 0..1
//   texunitindex - texture unit to sample
//   arrayindex   - attribute array holding the texture coordinates, passed to
//                  DPSOFTRAST_CALCATTRIB4F (presumably yielding the base value
//                  and per-x slope in data/slope - the macro is defined
//                  elsewhere, confirm)
//   zf           - per-pixel factor the interpolated coordinates are
//                  multiplied by (perspective correction)
// Texel bytes 2, 1, 0, 3 are written to output channels 0, 1, 2, 3, i.e. the
// first and third channel are swapped on the way out.
// NOTE(review): short lines (declarations, braces, the tcimin setup, the
// filter test and the updates of tc) are missing from this listing - confirm
// the control flow against the complete file.
2463 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2466 int startx = span->startx;
2467 int endx = span->endx;
2472 float tc[2], endtc[2];
// NOTE(review): the texel indices are unsigned, so a negative texture
// coordinate becomes a huge index before the clamp tests below see it -
// confirm that this is acceptable for clamp-to-edge textures
2474 unsigned int tci[2];
2475 unsigned int tci1[2];
2476 unsigned int tcimin[2];
2477 unsigned int tcimax[2];
2482 const unsigned char * RESTRICT pixelbase;
2483 const unsigned char * RESTRICT pixel[4];
2484 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2485 // if no texture is bound, just fill it with white
2488 for (x = startx;x < endx;x++)
2490 out4f[x*4+0] = 1.0f;
2491 out4f[x*4+1] = 1.0f;
2492 out4f[x*4+2] = 1.0f;
2493 out4f[x*4+3] = 1.0f;
2497 mip = triangle->mip[texunitindex];
// start of the selected mipmap level inside the texture data
2498 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2499 // if this mipmap of the texture is 1 pixel, just fill it with that color
2500 if (texture->mipmap[mip][1] == 4)
// NOTE(review): these reads index texture->bytes directly instead of
// pixelbase, so they return the first texel of the whole texture data and
// match the 1-pixel level only when mipmap[mip][0] is 0; the BGRA8 variant of
// this function reads through pixelbase. Looks unintended - confirm.
2502 c[0] = texture->bytes[2] * (1.0f/255.0f);
2503 c[1] = texture->bytes[1] * (1.0f/255.0f);
2504 c[2] = texture->bytes[0] * (1.0f/255.0f);
2505 c[3] = texture->bytes[3] * (1.0f/255.0f);
2506 for (x = startx;x < endx;x++)
2508 out4f[x*4+0] = c[0];
2509 out4f[x*4+1] = c[1];
2510 out4f[x*4+2] = c[2];
2511 out4f[x*4+3] = c[3];
2515 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2516 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2517 flags = texture->flags;
// width and height of the level scale the coordinates to texels; the width
// is also the row pitch in texels
2518 tcscale[0] = texture->mipmap[mip][2];
2519 tcscale[1] = texture->mipmap[mip][3];
2520 tciwidth = texture->mipmap[mip][2];
// largest valid texel index per axis, and the same value used as a wrap mask
// (only a true wrap for power-of-two sizes)
2523 tcimax[0] = texture->mipmap[mip][2]-1;
2524 tcimax[1] = texture->mipmap[mip][3]-1;
2525 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2526 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel of the span
2527 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2528 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// the exact coordinate is computed only at sub-span boundaries, which are
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart; in between it is stepped linearly
// in 20.12 fixed point
2534 for (x = startx;x < endx;)
2536 unsigned int subtc[2];
2537 unsigned int substep[2];
// 4096 / sub-span length: turns a texel difference into a per-pixel
// fixed-point step
2538 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2539 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2540 if (nextsub >= endx)
// last sub-span: end exactly on the final pixel and rescale the step; the
// x < nextsub test avoids a division by zero for a single pixel
2542 nextsub = endsub = endx-1;
2543 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2547 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2548 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2554 substep[0] = (endtc[0] - tc[0]) * subscale;
2555 substep[1] = (endtc[1] - tc[1]) * subscale;
2556 subtc[0] = tc[0] * (1<<12);
2557 subtc[1] = tc[1] * (1<<12);
// bilinear sampling with texel indices clamped to tcimin..tcimax
2560 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2562 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit fractions and the four bilinear weights, which sum to 0x1000000
2564 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2565 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2566 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2567 tci[0] = subtc[0]>>12;
2568 tci[1] = subtc[1]>>12;
2569 tci1[0] = tci[0] + 1;
2570 tci1[1] = tci[1] + 1;
2571 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2572 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2573 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2574 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2575 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2576 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2577 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2578 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// the weighted sum peaks at 255 * 0x1000000 = 0xFF000000, so dividing by
// that maps it to 0..1
2579 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2580 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2581 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2582 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2583 out4f[x*4+0] = c[0];
2584 out4f[x*4+1] = c[1];
2585 out4f[x*4+2] = c[2];
2586 out4f[x*4+3] = c[3];
// bilinear sampling with texel indices wrapped by the masks
2591 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2593 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2594 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2595 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2596 tci[0] = subtc[0]>>12;
2597 tci[1] = subtc[1]>>12;
2598 tci1[0] = tci[0] + 1;
2599 tci1[1] = tci[1] + 1;
2600 tci[0] &= tciwrapmask[0];
2601 tci[1] &= tciwrapmask[1];
2602 tci1[0] &= tciwrapmask[0];
2603 tci1[1] &= tciwrapmask[1];
2604 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2605 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2606 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2607 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2608 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2609 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2610 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2611 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2612 out4f[x*4+0] = c[0];
2613 out4f[x*4+1] = c[1];
2614 out4f[x*4+2] = c[2];
2615 out4f[x*4+3] = c[3];
// single-texel sampling with texel indices clamped to tcimin..tcimax
2619 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2621 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2623 tci[0] = subtc[0]>>12;
2624 tci[1] = subtc[1]>>12;
2625 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2626 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2627 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2628 c[0] = pixel[0][2] * (1.0f / 255.0f);
2629 c[1] = pixel[0][1] * (1.0f / 255.0f);
2630 c[2] = pixel[0][0] * (1.0f / 255.0f);
2631 c[3] = pixel[0][3] * (1.0f / 255.0f);
2632 out4f[x*4+0] = c[0];
2633 out4f[x*4+1] = c[1];
2634 out4f[x*4+2] = c[2];
2635 out4f[x*4+3] = c[3];
// single-texel sampling with texel indices wrapped by the masks
2640 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2642 tci[0] = subtc[0]>>12;
2643 tci[1] = subtc[1]>>12;
2644 tci[0] &= tciwrapmask[0];
2645 tci[1] &= tciwrapmask[1];
2646 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2647 c[0] = pixel[0][2] * (1.0f / 255.0f);
2648 c[1] = pixel[0][1] * (1.0f / 255.0f);
2649 c[2] = pixel[0][0] * (1.0f / 255.0f);
2650 c[3] = pixel[0][3] * (1.0f / 255.0f);
2651 out4f[x*4+0] = c[0];
2652 out4f[x*4+1] = c[1];
2653 out4f[x*4+2] = c[2];
2654 out4f[x*4+3] = c[3];
2660 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2664 int startx = span->startx;
2665 int endx = span->endx;
2667 __m128 data, slope, tcscale;
2668 __m128i tcsize, tcmask, tcoffset, tcmax;
2670 __m128i subtc, substep, endsubtc;
2673 int affine; // LordHavoc: optimized affine texturing case
2674 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2675 const unsigned char * RESTRICT pixelbase;
2676 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2677 // if no texture is bound, just fill it with white
2680 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2683 mip = triangle->mip[texunitindex];
2684 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2685 // if this mipmap of the texture is 1 pixel, just fill it with that color
2686 if (texture->mipmap[mip][1] == 4)
2688 unsigned int k = *((const unsigned int *)pixelbase);
2689 for (x = startx;x < endx;x++)
2693 affine = zf[startx] == zf[endx-1];
2694 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2695 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2696 flags = texture->flags;
2697 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2698 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2699 tcscale = _mm_cvtepi32_ps(tcsize);
2700 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2701 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2702 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2704 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2705 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2706 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2707 tcmax = _mm_packs_epi32(tcmask, tcmask);
2708 for (x = startx;x < endx;)
2710 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2711 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2712 if (nextsub >= endx || affine)
2714 nextsub = endsub = endx-1;
2715 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2719 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2721 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2722 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2723 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2724 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2725 substep = _mm_slli_epi32(substep, 1);
2728 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2729 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2731 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2732 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2734 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2735 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2736 tci = _mm_madd_epi16(tci, tcoffset);
2737 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2738 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2739 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2740 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2741 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2742 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2743 fracm = _mm_srli_epi16(subtc, 1);
2744 pix1 = _mm_add_epi16(pix1,
2745 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2746 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2747 pix3 = _mm_add_epi16(pix3,
2748 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2749 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2750 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2751 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2752 pix2 = _mm_add_epi16(pix2,
2753 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2754 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2755 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2759 const unsigned char * RESTRICT ptr1;
2760 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2761 tci = _mm_madd_epi16(tci, tcoffset);
2762 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2763 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2764 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2765 fracm = _mm_srli_epi16(subtc, 1);
2766 pix1 = _mm_add_epi16(pix1,
2767 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2768 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2769 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2770 pix1 = _mm_add_epi16(pix1,
2771 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2772 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2773 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2777 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2779 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2781 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2782 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2783 tci = _mm_madd_epi16(tci, tcoffset);
2784 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2785 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2786 _mm_setzero_si128());
2787 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2788 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2789 _mm_setzero_si128());
2790 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2791 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2792 tci = _mm_madd_epi16(tci, tcoffset);
2793 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2794 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2795 _mm_setzero_si128());
2796 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2797 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2798 _mm_setzero_si128());
2799 fracm = _mm_srli_epi16(subtc, 1);
2800 pix1 = _mm_add_epi16(pix1,
2801 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2802 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2803 pix3 = _mm_add_epi16(pix3,
2804 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2805 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2806 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2807 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2808 pix2 = _mm_add_epi16(pix2,
2809 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2810 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2811 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2815 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2816 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2817 tci = _mm_madd_epi16(tci, tcoffset);
2818 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2819 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2820 _mm_setzero_si128());
2821 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2822 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2823 _mm_setzero_si128());
2824 fracm = _mm_srli_epi16(subtc, 1);
2825 pix1 = _mm_add_epi16(pix1,
2826 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2827 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2828 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2829 pix1 = _mm_add_epi16(pix1,
2830 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2831 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2832 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2838 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2840 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2841 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2842 tci = _mm_madd_epi16(tci, tcoffset);
2843 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2844 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2845 _mm_setzero_si128());
2846 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2847 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2848 _mm_setzero_si128());
2849 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2850 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2851 tci = _mm_madd_epi16(tci, tcoffset);
2852 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2853 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2854 _mm_setzero_si128());
2855 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2856 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2857 _mm_setzero_si128());
2858 fracm = _mm_srli_epi16(subtc, 1);
2859 pix1 = _mm_add_epi16(pix1,
2860 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2861 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2862 pix3 = _mm_add_epi16(pix3,
2863 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2864 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2865 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2866 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2867 pix2 = _mm_add_epi16(pix2,
2868 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2869 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2870 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2874 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2875 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2876 tci = _mm_madd_epi16(tci, tcoffset);
2877 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2878 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2879 _mm_setzero_si128());
2880 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2881 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2882 _mm_setzero_si128());
2883 fracm = _mm_srli_epi16(subtc, 1);
2884 pix1 = _mm_add_epi16(pix1,
2885 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2886 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2887 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2888 pix1 = _mm_add_epi16(pix1,
2889 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2890 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2891 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2898 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2900 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2902 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2903 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2904 tci = _mm_madd_epi16(tci, tcoffset);
2905 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2906 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2910 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2911 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2912 tci = _mm_madd_epi16(tci, tcoffset);
2913 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2919 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2921 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2922 tci = _mm_and_si128(tci, tcmax);
2923 tci = _mm_madd_epi16(tci, tcoffset);
2924 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2925 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2929 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2930 tci = _mm_and_si128(tci, tcmax);
2931 tci = _mm_madd_epi16(tci, tcoffset);
2932 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map span sampler stand-in: no texture lookup is performed by the visible code.
// The span's pixels [startx, endx) in out4ub (4 bytes per pixel) are filled with byte value 255.
// texunitindex, arrayindex and zf are not used by the visible code.
2941 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// byte count is (endx - startx)*4; with the operands reversed the count is negative for any
// non-empty span and converts to an enormous size_t, overrunning the output buffer
2944 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Shadow-map sampling entry point: takes a pointer to const float data and returns a float.
// NOTE(review): only the signature is visible in this excerpt; what the body computes cannot be confirmed from here.
2947 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Float span op: for each x in [startx, endx), multiplies in4f component-wise by the
// interpolated attribute `arrayindex` and writes the product to out4f.
// Each pixel occupies 4 consecutive floats in in4f/out4f (index x*4 .. x*4+3).
2953 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2956 int startx = span->startx;
2957 int endx = span->endx;
// NOTE(review): presumably fills data[]/slope[] with the attribute's base value and per-x step for this span — macro body not visible here; confirm
2962 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2963 for (x = startx;x < endx;x++)
// attribute evaluated linearly at x, then scaled by z
// NOTE(review): the assignment of z is not visible in this excerpt; presumably it is read from zf[x] — confirm
2966 c[0] = (data[0] + slope[0]*x) * z;
2967 c[1] = (data[1] + slope[1]*x) * z;
2968 c[2] = (data[2] + slope[2]*x) * z;
2969 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input pixel by the attribute
2970 out4f[x*4+0] = in4f[x*4+0] * c[0];
2971 out4f[x*4+1] = in4f[x*4+1] * c[1];
2972 out4f[x*4+2] = in4f[x*4+2] * c[2];
2973 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Float span op: for each x in [startx, endx), writes the interpolated attribute
// `arrayindex` (scaled by z) directly to out4f, 4 floats per pixel.
2977 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2980 int startx = span->startx;
2981 int endx = span->endx;
// NOTE(review): presumably fills data[]/slope[] with the attribute's base value and per-x step for this span — macro body not visible here; confirm
2986 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2987 for (x = startx;x < endx;x++)
// attribute evaluated linearly at x, then scaled by z
// NOTE(review): the assignment of z is not visible in this excerpt; presumably it is read from zf[x] — confirm
2990 c[0] = (data[0] + slope[0]*x) * z;
2991 c[1] = (data[1] + slope[1]*x) * z;
2992 c[2] = (data[2] + slope[2]*x) * z;
2993 c[3] = (data[3] + slope[3]*x) * z;
2994 out4f[x*4+0] = c[0];
2995 out4f[x*4+1] = c[1];
2996 out4f[x*4+2] = c[2];
2997 out4f[x*4+3] = c[3];
// Float span op: out4f = ina4f + max(inb4f - subcolor, 0), component-wise,
// for each x in [startx, endx); 4 floats per pixel. `triangle` is not used by the visible code.
3001 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
3003 int x, startx = span->startx, endx = span->endx;
3004 float c[4], localcolor[4];
// copy the 4-component subtract color into locals before the loop
3005 localcolor[0] = subcolor[0];
3006 localcolor[1] = subcolor[1];
3007 localcolor[2] = subcolor[2];
3008 localcolor[3] = subcolor[3];
3009 for (x = startx;x < endx;x++)
// subtract the threshold color from the second input and clamp negatives to zero
3011 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
3012 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
3013 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
3014 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
// add the clamped remainder onto the first input
3015 out4f[x*4+0] = ina4f[x*4+0] + c[0];
3016 out4f[x*4+1] = ina4f[x*4+1] + c[1];
3017 out4f[x*4+2] = ina4f[x*4+2] + c[2];
3018 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Float span op: out4f = ina4f * inb4f, component-wise, for each x in [startx, endx);
// 4 floats per pixel. `triangle` is not used by the visible code.
3022 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3024 int x, startx = span->startx, endx = span->endx;
3025 for (x = startx;x < endx;x++)
3027 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
3028 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
3029 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
3030 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Float span op: out4f = ina4f + inb4f, component-wise, for each x in [startx, endx);
// 4 floats per pixel, no clamping. `triangle` is not used by the visible code.
3034 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3036 int x, startx = span->startx, endx = span->endx;
3037 for (x = startx;x < endx;x++)
3039 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
3040 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
3041 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
3042 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span op: out4f = ina4f * a + inb4f * b per component, for each x in [startx, endx),
// where a is one minus the fourth component of the inb4f pixel. 4 floats per pixel.
3046 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3048 int x, startx = span->startx, endx = span->endx;
3050 for (x = startx;x < endx;x++)
// weight for the first input: complement of the second input's fourth component
3052 a = 1.0f - inb4f[x*4+3];
// NOTE(review): the assignment of b is not visible in this excerpt; presumably b = inb4f[x*4+3] — confirm
3054 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
3055 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
3056 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
3057 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Float span op: blends every pixel in [startx, endx) toward a constant color:
// out4f = in4f * (1 - color[3]) + color * color[3], component-wise (4 floats per pixel).
// `triangle` is not used by the visible code.
3061 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
3063 int x, startx = span->startx, endx = span->endx;
3064 float localcolor[4], ilerp, lerp;
3065 localcolor[0] = color[0];
3066 localcolor[1] = color[1];
3067 localcolor[2] = color[2];
3068 localcolor[3] = color[3];
// blend weights come from the color's fourth component and are constant across the span
3069 ilerp = 1.0f - localcolor[3];
3070 lerp = localcolor[3];
3071 for (x = startx;x < endx;x++)
3073 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3074 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3075 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3076 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// 8-bit span op (SSE2): multiplies the 4-byte pixels of in4ub by the interpolated attribute
// `arrayindex` and writes them to out4ub. The attribute is evaluated exactly (attribute * zf)
// only at sub-span endpoints and stepped linearly in 8.8 fixed point in between.
3082 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3086 int startx = span->startx;
3087 int endx = span->endx;
3090 __m128i submod, substep, endsubmod;
// NOTE(review): presumably loads the attribute's base value and per-x slope into data/slope — macro body not visible here; confirm
3091 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (lanes 1 and 3 stay) — presumably RGBA attribute order to BGRA byte order
3092 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3093 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, scaled by zf[startx]; also kept as an integer scaled by 256
3094 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3095 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// x is advanced inside the body, one sub-span per iteration
3096 for (x = startx; x < endx;)
// default sub-span: DPSOFTRAST_DRAW_MAXSUBSPAN pixels, with a per-pixel step scale of 256/MAXSUBSPAN
3098 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3099 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final sub-span: end on the last pixel of the span and rescale the step to the shorter length
3100 if (nextsub >= endx)
3102 nextsub = endsub = endx-1;
3103 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// NOTE(review): the statements that carry the previous endpoint into mod/submod are not visible in this excerpt
// exact attribute value at the sub-span end; substep is the per-pixel fixed-point delta toward it
3107 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3108 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3109 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes with signed saturation: low half = value for pixel x, high half = value for pixel x+1
3110 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3111 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by a single per-pixel substep for every two pixels here — confirm whether the step should be doubled
3112 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// input bytes are placed in the high byte of each 16-bit lane (value*256); mulhi_epu16 then yields (byte * submod) >> 8
3114 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3115 pix = _mm_mulhi_epu16(pix, submod);
// pack back to bytes with unsigned saturation and store 8 bytes (two pixels)
3116 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same arithmetic (NOTE(review): its guarding condition is not visible in this excerpt)
3120 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3121 pix = _mm_mulhi_epu16(pix, submod);
3122 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span op (SSE2): writes the interpolated attribute `arrayindex` to out4ub as 4-byte pixels.
// The attribute is evaluated exactly (attribute * zf) only at sub-span endpoints and stepped
// linearly in between, in fixed point scaled by 4095; values are shifted right by 4 before packing to bytes.
3129 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3133 int startx = span->startx;
3134 int endx = span->endx;
3137 __m128i submod, substep, endsubmod;
// NOTE(review): presumably loads the attribute's base value and per-x slope into data/slope — macro body not visible here; confirm
3138 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (lanes 1 and 3 stay) — presumably RGBA attribute order to BGRA byte order
3139 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3140 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, scaled by zf[startx]; also kept as an integer scaled by 4095
3141 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3142 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// x is advanced inside the body, one sub-span per iteration
3143 for (x = startx; x < endx;)
// default sub-span: DPSOFTRAST_DRAW_MAXSUBSPAN pixels, with a per-pixel step scale of 4095/MAXSUBSPAN
3145 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3146 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final sub-span: end on the last pixel of the span and rescale the step to the shorter length
3147 if (nextsub >= endx)
3149 nextsub = endsub = endx-1;
3150 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// NOTE(review): the statements that carry the previous endpoint into mod/submod are not visible in this excerpt
// exact attribute value at the sub-span end; substep is the per-pixel fixed-point delta toward it
3154 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3155 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3156 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes with signed saturation: low half = value for pixel x, high half = value for pixel x+1
3157 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3158 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by a single per-pixel substep for every two pixels here — confirm whether the step should be doubled
3159 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// arithmetic shift right by 4 brings the 4095-scaled value to byte range; packus saturates to 0..255
3161 __m128i pix = _mm_srai_epi16(submod, 4);
3162 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant (NOTE(review): its guarding condition is not visible in this excerpt)
3166 __m128i pix = _mm_srai_epi16(submod, 4);
3167 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span op (SSE2): out4ub = saturate(ina4ub + max(inb4ub - subcolor*255, 0)) per byte channel,
// for pixels [startx, endx), 4 bytes per pixel. `triangle` is not used by the visible code.
3174 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3177 int x, startx = span->startx, endx = span->endx;
// scale the float color by 255, convert to integers, and swap lanes 0 and 2 (presumably RGBA -> BGRA)
3178 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes, duplicated in both halves so it lines up with two pixels
3179 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
3180 for (x = startx;x+2 <= endx;x+=2)
// zero-extend the 8 input bytes of each buffer to 16-bit lanes
3182 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3183 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps (inb - color) at zero before the add
3184 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
// pack to bytes with unsigned saturation and store two pixels
3185 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel (NOTE(review): its guarding condition is not visible in this excerpt)
3189 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3190 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3191 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3192 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): out4ub = (ina4ub * inb4ub) >> 8 per byte channel, for pixels [startx, endx),
// 4 bytes per pixel. `triangle` is not used by the visible code.
3197 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3200 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3201 for (x = startx;x+2 <= endx;x+=2)
// pix1: ina bytes zero-extended to 16 bits; pix2: inb bytes placed in the high byte (value*256)
3203 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3204 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of a * (b*256), i.e. (a*b) >> 8
3205 pix1 = _mm_mulhi_epu16(pix1, pix2);
3206 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel (NOTE(review): its guarding condition is not visible in this excerpt)
3210 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3211 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3212 pix1 = _mm_mulhi_epu16(pix1, pix2);
3213 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): out4ub = min(ina4ub + inb4ub, 255) per byte channel, for pixels [startx, endx),
// 4 bytes per pixel. `triangle` is not used by the visible code.
3218 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3221 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3222 for (x = startx;x+2 <= endx;x+=2)
// zero-extend both inputs to 16-bit lanes so the sum cannot wrap
3224 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3225 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3226 pix1 = _mm_add_epi16(pix1, pix2);
// packus saturates the 16-bit sums to 0..255
3227 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel (NOTE(review): its guarding condition is not visible in this excerpt)
3231 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3232 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3233 pix1 = _mm_add_epi16(pix1, pix2);
3234 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): out4ub = saturate(ina4ub + inb4ub * tint) per byte channel, for pixels
// [startx, endx), where tint is inbtintbgra converted to fixed point scaled by 256.
// The tint lanes are used in the order given (no lane swap is applied). `triangle` is not used by the visible code.
3239 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3242 int x, startx = span->startx, endx = span->endx;
// float tint -> integers scaled by 256, then narrowed to 16-bit lanes duplicated for two pixels
3243 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3244 tint = _mm_packs_epi32(tint, tint);
// two pixels per iteration
3245 for (x = startx;x+2 <= endx;x+=2)
// pix1: ina bytes zero-extended; pix2: inb bytes in the high byte of each lane (value*256)
3247 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3248 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// mulhi gives (tint * inb*256) >> 16 = (tint * inb) >> 8, which is then added to ina
3249 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3250 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel (NOTE(review): its guarding condition is not visible in this excerpt)
3254 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3255 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3256 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3257 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): blends ina4ub toward inb4ub using the fourth byte of each inb4ub pixel as weight:
// out = ina + ((inb - ina) * weight) >> 8 per channel, for pixels [startx, endx).
// `triangle` is not used by the visible code.
3262 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3265 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3266 for (x = startx;x+2 <= endx;x+=2)
// zero-extend both inputs to 16-bit lanes
3268 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3269 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel (its fourth channel) across that pixel's four lanes
3270 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are pre-shifted by 4 so the signed high-multiply returns (difference * weight) >> 8
3271 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3272 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel (NOTE(review): its guarding condition is not visible in this excerpt)
3276 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3277 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3278 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3279 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3280 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): blends every pixel in [startx, endx) toward a constant color, weighted by the
// color's fourth component: out = in + ((color*255 - in) * weight) >> 8 per channel.
// `triangle` is not used by the visible code.
3285 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3288 int x, startx = span->startx, endx = span->endx;
// scale the float color by 255, convert to integers, and swap lanes 0 and 2 (presumably RGBA -> BGRA)
3289 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes, duplicated in both halves so it lines up with two pixels
3290 localcolor = _mm_packs_epi32(localcolor, localcolor);
// weight: lane 3 of the color broadcast to all lanes, pre-shifted by 4 for the high-multiply below
3291 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration
3292 for (x = startx;x+2 <= endx;x+=2)
3294 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is also shifted by 4, so the signed high-multiply returns (difference * weight) >> 8
3295 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3296 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant for a leftover pixel (NOTE(review): its guarding condition is not visible in this excerpt)
3300 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3301 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3302 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage for the generic shader: transform-projects the position array with the
// ModelViewProjectionMatrixM1 uniform and passes the color and texcoord0 arrays through
// DPSOFTRAST_Array_Load; texcoord1 is loaded as well when the SPECULAR permutation bit is set.
3309 void DPSOFTRAST_VertexShader_Generic(void)
3311 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3312 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3313 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// the second texture coordinate set is only loaded for the SPECULAR permutation
3314 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3315 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage for the generic shader: builds the span's color in buffer_FragColorbgra8 from up to two
// textures and the interpolated attribute 1, chosen by permutation bits, then hands it to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): brace lines are not visible in this excerpt, so the nesting described below is inferred — confirm.
3318 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers: one float per pixel for z, 4 bytes per pixel for colors
3320 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3321 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3322 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3323 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3324 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3325 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// sample the first texture (attribute 2 as coordinates) and modulate it by attribute 1
3327 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3328 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3329 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// sample the second texture, then combine it with the running color by multiply, add, or mix
3331 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3332 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3335 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this tests the same COLORMAPPING bit as the branch above, so the add path looks unreachable — confirm which permutation flag was intended
3337 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3340 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3342 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3345 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured path: the color is just the interpolated attribute 1 (presumably the else of the DIFFUSE test — confirm)
3350 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3351 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the post-process shader: transform-projects the position array with the
// ModelViewProjectionMatrixM1 uniform, then issues two DPSOFTRAST_Array_Load calls, one with the
// same array id for both arguments (TEXCOORD0) and one pairing TEXCOORD1 with TEXCOORD4.
3356 void DPSOFTRAST_VertexShader_PostProcess(void)
3358 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3359 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// NOTE(review): the two array ids differ here; which argument is source and which is destination is not visible in this excerpt — confirm
3360 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for the post-process shader: samples the first texture into the fragment buffer,
// optionally adds thresholded bloom from the second texture, mixes in the ViewTintColor uniform,
// and hands the result to DPSOFTRAST_Draw_Span_FinishBGRA8. Saturation and gamma ramps are not implemented.
3363 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3365 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
// per-span scratch buffers: one float per pixel for z, 4 bytes per pixel for colors
3366 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3367 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3368 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3369 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture, attribute 2 as coordinates
3370 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3371 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture sampled with attribute 3, then added after subtracting the BloomColorSubtract uniform
3373 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3374 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// NOTE(review): brace lines are not visible in this excerpt; presumably the tint mix runs regardless of BLOOM — confirm
3376 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3377 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3379 // TODO: implement saturation
3381 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3383 // TODO: implement gammaramps
3385 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the depth/shadow shader: the only visible work is transform-projecting the
// position array with the ModelViewProjectionMatrixM1 uniform.
3390 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3392 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the depth/shadow shader: fills the span's fragment color with zero bytes and
// passes it to DPSOFTRAST_Draw_Span_FinishBGRA8.
3395 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3397 // this is never called (because colormask is off when this shader is used)
3398 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3399 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3400 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// zero the 4-byte pixels in [startx, endx)
3401 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3402 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the flat-color shader: transform-projects the position array with the
// ModelViewProjectionMatrixM1 uniform and transforms the texcoord0 array with the TexMatrixM1 uniform.
3407 void DPSOFTRAST_VertexShader_FlatColor(void)
3409 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3410 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the flat-color shader: samples the color texture and multiplies it by a constant
// built from the Color_Ambient uniform (color lanes) and the Alpha uniform (fourth lane).
// Results go straight to the framebuffer row, or to a scratch buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8 when alpha test or a non-opaque blend mode is active.
3413 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3416 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: color buffer 0 at row span->y, column span->x (4 bytes per pixel)
3417 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3418 int x, startx = span->startx, endx = span->endx;
3419 __m128i Color_Ambientm;
3420 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3421 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3422 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3423 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3424 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finish pass, so render into the scratch buffer instead
3425 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3426 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 to integers, lanes 0 and 2 swapped (presumably RGBA -> BGRA)
3427 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear the fourth lane and replace it with the Alpha uniform's first component scaled by 255
3428 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3429 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrow to 16-bit lanes, duplicated in both halves so it lines up with two pixels
3430 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3431 for (x = startx;x < endx;x++)
// fast path: at least 4 pixels remain and all four mask bytes equal 1
3434 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texels go into the high byte of each 16-bit lane, so mulhi yields (multiplier * texel) >> 8
3437 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3438 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3439 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
// store four pixels; NOTE(review): the statements that advance x past them are not visible in this excerpt
3440 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path (NOTE(review): any pixelmask test guarding it is not visible in this excerpt)
3446 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3447 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3448 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// when the scratch buffer was used, let the finish pass write it out
3450 if (pixel == buffer_FragColorbgra8)
3451 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the vertex-color shader: transform-projects the position array with the
// ModelViewProjectionMatrixM1 uniform, loads the color array, and transforms the texcoord0 array
// with the TexMatrixM1 uniform.
3457 void DPSOFTRAST_VertexShader_VertexColor(void)
3459 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3460 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3461 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the vertex-color shader: per pixel computes
// texel * (Color_Diffuse * interpolated color attribute + Color_Ambient) in 16-bit fixed point,
// with the fourth lane driven by the Alpha uniform. Results go straight to the framebuffer row, or to
// a scratch buffer passed to DPSOFTRAST_Draw_Span_FinishBGRA8 when alpha test or a non-opaque blend mode is active.
3464 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3467 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: color buffer 0 at row span->y, column span->x (4 bytes per pixel)
3468 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3469 int x, startx = span->startx, endx = span->endx;
3470 __m128i Color_Ambientm, Color_Diffusem;
3472 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3473 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3474 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3475 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3476 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3477 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finish pass, so render into the scratch buffer instead
3478 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3479 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 to integers, lanes 0 and 2 swapped (presumably RGBA -> BGRA)
3480 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear the fourth lane and replace it with the Alpha uniform's first component scaled by 255
3481 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3482 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3483 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096 with the same lane swap; its fourth lane is zeroed so it adds nothing to alpha
3484 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3485 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3486 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// NOTE(review): presumably loads the color attribute's base value and per-x slope into data/slope — macro body not visible here; confirm
3487 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 to match the byte order used above
3488 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3489 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// move data to the first pixel, then scale value and step by 4096 so (diffuse*4096 * attr*4096) >> 16 is at the ambient's 256 scale
3490 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3491 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3492 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped once per loop iteration, plus three more times inside the four-pixel path
3493 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3495 __m128i color, mod, pix;
// fast path: at least 4 pixels remain and all four mask bytes equal 1
3496 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four consecutive z values; each pixel's attribute is multiplied by its own z, broadcast across lanes
3499 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3500 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3501 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3502 data = _mm_add_ps(data, slope);
// mod holds pixels x and x+1 as 16-bit lanes (signed saturation)
3503 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3504 data = _mm_add_ps(data, slope);
3505 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3506 data = _mm_add_ps(data, slope);
// mod2 holds pixels x+2 and x+3
3507 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * attribute + ambient) times the texel placed in the high byte, keeping the high 16 bits
3508 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3509 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3510 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3511 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
// store four pixels; NOTE(review): the statements that advance x past them are not visible in this excerpt
3512 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path with the same arithmetic (NOTE(review): any pixelmask test guarding it is not visible in this excerpt)
3518 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3519 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3520 mod = _mm_packs_epi32(mod, mod);
3521 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3522 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// when the scratch buffer was used, let the finish pass write it out
3524 if (pixel == buffer_FragColorbgra8)
3525 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader.
// Projects POSITION by the ModelViewProjectionMatrixM1 uniform, transforms
// TEXCOORD0 by the TexMatrixM1 uniform and loads TEXCOORD4, which
// DPSOFTRAST_PixelShader_Lightmap uses as the lightmap texture coordinate.
3531 void DPSOFTRAST_VertexShader_Lightmap(void)
3533 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3534 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3535 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader; shades one span.
// Per pixel it evaluates
//   (lightmap * Color_Diffuse + Color_Ambient) * color [+ glow * Color_Glow]
// in 16-bit fixed point, where the glow term is only added when
// SHADERPERMUTATION_GLOW is set. The color and glow textures are sampled with
// TEXCOORD0, the lightmap with TEXCOORD4.
// thread   - per-thread state (uniforms, shader permutation, blend mode)
// triangle - triangle being rasterized, forwarded to the span helpers
// span     - span to shade; x runs over [span->startx, span->endx)
3538 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3541 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default write straight into color buffer 0 at the span's origin (4 bytes per pixel)
3542 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3543 int x, startx = span->startx, endx = span->endx;
3544 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3545 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3546 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3547 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3548 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3549 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3550 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3551 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3552 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// with alpha test or any non-opaque blend mode, shade into the temporary
// buffer instead; it is handed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end
3553 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3554 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 and converted to integers; the shuffle swaps
// lanes 0 and 2 (presumably RGBA uniform order -> BGRA pixel order)
3555 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear lane 3 and replace it with the Alpha uniform scaled by 255
3556 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3557 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrow to 16-bit lanes; packing the register with itself repeats the four
// channels in both 64-bit halves, so one register covers two pixels
3558 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color, same conversion; lane 3 is zeroed so the alpha lane of the
// result receives only the ambient register's alpha term
3559 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3560 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3561 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3562 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow permutation: sample the glow texture with TEXCOORD0 and prepare its color
3564 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3565 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3566 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3567 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow color; used by the one-pixel path below
3568 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3569 for (x = startx;x < endx;x++)
3571 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and their four mask bytes are all 0x01
3572 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3575 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3576 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3577 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// unpacking against zero puts each 8-bit channel in the high byte of a 16-bit
// lane (channel << 8), so _mm_mulhi_epu16 against a color scaled by 256 keeps
// the upper 16 bits of the product; pix handles pixels 0-1, pix2 pixels 2-3
3578 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3579 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3580 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3581 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3582 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3583 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
// saturate back to 8 bits per channel and store all four pixels
3584 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// one-pixel path: each texel is expanded into the low 64 bits only
3590 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3591 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3592 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
// low half: (lightmap * diffuse + ambient) * color; high half: the lightmap
// term is zero there, so it reduces to glow color * glow texel
3593 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// add the glow term from the high half onto the lit color in the low half
3594 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3595 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same loop without the glow term
3600 for (x = startx;x < endx;x++)
3602 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and their four mask bytes are all 0x01
3603 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3606 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3607 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (lightmap * diffuse + ambient) * color for pixels 0-1 (pix) and 2-3 (pix2)
3608 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3609 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3610 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3611 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3612 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// one-pixel path
3618 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3619 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3620 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3621 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// if the span was shaded into the temporary buffer, let the finish helper
// process it (it receives the thread, triangle, span and the shaded colors)
3624 if (pixel == buffer_FragColorbgra8)
3625 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shader wrappers
// below call these two functions before their definitions appear in the file.
3630 void DPSOFTRAST_VertexShader_LightDirection(void);
3631 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex stage for the fake-light mode: delegates entirely to
// DPSOFTRAST_VertexShader_LightDirection.
3633 void DPSOFTRAST_VertexShader_FakeLight(void)
3635 DPSOFTRAST_VertexShader_LightDirection();
// Pixel stage for the fake-light mode: forwards its arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection, which branches on thread->shader_mode.
3638 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3640 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage for the model-space light-direction-map mode: runs the shared
// LightDirection vertex shader, then additionally loads TEXCOORD4 (the
// coordinate the pixel shader uses for the lightmap and deluxemap lookups).
3645 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3647 DPSOFTRAST_VertexShader_LightDirection();
3648 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for the model-space light-direction-map mode: forwards its
// arguments unchanged to DPSOFTRAST_PixelShader_LightDirection, which
// branches on thread->shader_mode.
3651 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3653 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage for the tangent-space light-direction-map mode: runs the shared
// LightDirection vertex shader, then additionally loads TEXCOORD4 (the
// coordinate the pixel shader uses for the lightmap and deluxemap lookups).
3658 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3660 DPSOFTRAST_VertexShader_LightDirection();
3661 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for the tangent-space light-direction-map mode: forwards its
// arguments unchanged to DPSOFTRAST_PixelShader_LightDirection, which
// branches on thread->shader_mode.
3664 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3666 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage shared by the light-direction shader modes.
// For every vertex it expresses the light direction and the vertex-to-eye
// vector in the basis formed by TEXCOORD1 (svector), TEXCOORD2 (tvector) and
// TEXCOORD3 (normal), and stores them in TEXCOORD5 and TEXCOORD6 with w = 0.
// POSITION is projected only at the very end, after it has been read here.
3671 void DPSOFTRAST_VertexShader_LightDirection(void)
3674 int numvertices = dpsoftrast.numvertices;
3676 float LightVector[4];
3677 float EyePosition[4];
3678 float EyeVectorModelSpace[4];
// copy the LightDir and EyePosition uniforms into locals
3684 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3685 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3686 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3687 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3688 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3689 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3690 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3691 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// load the arrays that the loop below reads back from dpsoftrast.post_array4f;
// TEXCOORD0 is additionally transformed by the TexMatrixM1 uniform
3692 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3693 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3694 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3695 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3696 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3697 for (i = 0;i < numvertices;i++)
// fetch this vertex's position and its three basis vectors (xyz only)
3699 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3700 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3701 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3702 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3703 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3704 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3705 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3706 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3707 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3708 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3709 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3710 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light vector = dot products of LightDir with svector, tvector and normal
3711 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3712 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3713 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3714 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3715 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3716 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3717 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// eye vector = (EyePosition - position), projected onto the same three vectors
3718 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3719 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3720 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3721 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3722 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3723 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3724 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3725 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3726 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3727 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// project POSITION by the ModelViewProjectionMatrixM1 uniform last.
// NOTE(review): the second argument is -1 here, unlike the Lightmap vertex
// shader which passes DPSOFTRAST_ARRAY_POSITION - TODO confirm its meaning
// against DPSOFTRAST_Array_TransformProject.
3729 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helpers used by the light-direction pixel shader.
// DPSOFTRAST_Min / DPSOFTRAST_Max expand each argument more than once, so
// arguments with side effects (e.g. i++) must not be passed to them.
3732 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3733 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three components of a and b
3734 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length, and length via sqrt(), of the first three components of v
3735 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3736 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// multi-line macro; the line visible here computes the length of v with
// sqrt() into a local float named len
3737 #define DPSOFTRAST_Vector3Normalize(v)\
3740 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3751 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3753 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3754 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3755 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3756 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3757 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3758 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3759 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3760 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3761 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3762 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3763 int x, startx = span->startx, endx = span->endx;
3764 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3765 float LightVectordata[4];
3766 float LightVectorslope[4];
3767 float EyeVectordata[4];
3768 float EyeVectorslope[4];
3769 float VectorSdata[4];
3770 float VectorSslope[4];
3771 float VectorTdata[4];
3772 float VectorTslope[4];
3773 float VectorRdata[4];
3774 float VectorRslope[4];
3776 float diffusetex[4];
3778 float surfacenormal[4];
3779 float lightnormal[4];
3780 float lightnormal_modelspace[4];
3782 float specularnormal[4];
3785 float SpecularPower;
3787 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3788 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3789 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3790 Color_Glow[3] = 0.0f;
3791 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3792 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3793 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3794 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3795 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3796 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3797 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3798 Color_Pants[3] = 0.0f;
3799 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3800 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3801 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3802 Color_Shirt[3] = 0.0f;
3803 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3804 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3805 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3807 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3808 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3810 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3812 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3814 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3816 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3817 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3818 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3819 Color_Diffuse[3] = 0.0f;
3820 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3821 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3822 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3823 LightColor[3] = 0.0f;
3824 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3825 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3826 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3827 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3828 Color_Specular[3] = 0.0f;
3829 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3830 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3831 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3833 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3835 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3836 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3837 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3838 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3839 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3841 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3843 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3844 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3846 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3848 // nothing of this needed
3852 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3855 for (x = startx;x < endx;x++)
3858 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3859 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3860 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3861 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3862 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3864 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3865 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3866 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3867 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3869 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3870 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3871 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3872 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3873 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3874 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3875 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3876 DPSOFTRAST_Vector3Normalize(surfacenormal);
3878 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3880 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3881 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3882 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3883 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3885 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3886 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3887 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3888 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3890 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3891 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3892 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3893 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3895 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3896 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3897 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3898 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3900 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3901 DPSOFTRAST_Vector3Normalize(lightnormal);
3903 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3905 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3906 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3907 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3908 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3911 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3913 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3914 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3915 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3917 float f = 1.0f / 256.0f;
3918 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3919 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3920 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3923 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3925 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3926 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3927 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3928 DPSOFTRAST_Vector3Normalize(lightnormal);
3930 LightColor[0] = 1.0;
3931 LightColor[1] = 1.0;
3932 LightColor[2] = 1.0;
3936 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3937 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3938 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3939 DPSOFTRAST_Vector3Normalize(lightnormal);
3942 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3944 if(thread->shader_exactspecularmath)
3946 // reflect lightnormal at surfacenormal, take the negative of that
3947 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3949 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3950 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3951 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3952 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3954 // dot of this and normalize(EyeVectorFogDepth.xyz)
3955 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3956 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3957 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3958 DPSOFTRAST_Vector3Normalize(eyenormal);
3960 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3964 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3965 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3966 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3967 DPSOFTRAST_Vector3Normalize(eyenormal);
3969 specularnormal[0] = lightnormal[0] + eyenormal[0];
3970 specularnormal[1] = lightnormal[1] + eyenormal[1];
3971 specularnormal[2] = lightnormal[2] + eyenormal[2];
3972 DPSOFTRAST_Vector3Normalize(specularnormal);
3974 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3977 specular = pow(specular, SpecularPower * glosstex[3]);
3978 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3980 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3981 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3982 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3983 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3987 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3988 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3989 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3990 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3993 buffer_FragColorbgra8[x*4+0] = d[0];
3994 buffer_FragColorbgra8[x*4+1] = d[1];
3995 buffer_FragColorbgra8[x*4+2] = d[2];
3996 buffer_FragColorbgra8[x*4+3] = d[3];
3999 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4001 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4002 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4003 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4004 Color_Diffuse[3] = 0.0f;
4005 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4006 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4007 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4008 LightColor[3] = 0.0f;
4009 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4011 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4013 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4014 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4015 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4016 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
4017 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
4019 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4021 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
4022 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
4024 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4026 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4030 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
4033 for (x = startx;x < endx;x++)
4036 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4037 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4038 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4039 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4040 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4041 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4042 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4043 DPSOFTRAST_Vector3Normalize(surfacenormal);
4045 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4047 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
4048 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4049 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4050 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4052 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
4053 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
4054 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
4055 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
4057 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
4058 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
4059 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
4060 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
4062 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
4063 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
4064 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
4065 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4067 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4068 DPSOFTRAST_Vector3Normalize(lightnormal);
4070 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
4072 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4073 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4074 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4075 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4078 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4080 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4081 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4082 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4084 float f = 1.0f / 256.0f;
4085 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4086 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4087 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4090 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4092 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4093 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4094 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4095 DPSOFTRAST_Vector3Normalize(lightnormal);
4097 LightColor[0] = 1.0;
4098 LightColor[1] = 1.0;
4099 LightColor[2] = 1.0;
4103 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4104 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4105 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4106 DPSOFTRAST_Vector3Normalize(lightnormal);
4109 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4110 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4112 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4113 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4114 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4115 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4119 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4120 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4121 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4122 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4124 buffer_FragColorbgra8[x*4+0] = d[0];
4125 buffer_FragColorbgra8[x*4+1] = d[1];
4126 buffer_FragColorbgra8[x*4+2] = d[2];
4127 buffer_FragColorbgra8[x*4+3] = d[3];
4132 for (x = startx;x < endx;x++)
4135 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4136 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4137 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4138 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4140 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4142 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4143 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4144 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4145 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4149 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4150 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4151 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4152 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4154 buffer_FragColorbgra8[x*4+0] = d[0];
4155 buffer_FragColorbgra8[x*4+1] = d[1];
4156 buffer_FragColorbgra8[x*4+2] = d[2];
4157 buffer_FragColorbgra8[x*4+3] = d[3];
4160 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the LightSource shader mode.
// For every vertex this overwrites the TEXCOORD1 output with the
// vertex-to-light vector and the TEXCOORD2 output with the vertex-to-eye
// vector, each expressed as three dot products against the vectors read from
// TEXCOORD1/2/3 (svector, tvector, normal). Afterwards POSITION is projected
// with the ModelViewProjection matrix and TEXCOORD3 is produced with the
// ModelToLight matrix.
4165 void DPSOFTRAST_VertexShader_LightSource(void)
4168 int numvertices = dpsoftrast.numvertices;
4169 float LightPosition[4];
4170 float LightVector[4];
4171 float LightVectorModelSpace[4];
4172 float EyePosition[4];
4173 float EyeVectorModelSpace[4];
// fetch the light and eye positions from the uniform table (4 floats each);
// only components 0..2 are used by the loop below
4179 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4180 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4181 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4182 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4183 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4184 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4185 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4186 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// prepare the arrays that the loop reads back from dpsoftrast.post_array4f;
// TEXCOORD0 goes through the texture matrix instead of a plain load
// NOTE(review): argument order of these helpers looks like (output, input) - confirm
4187 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4188 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4189 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4190 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4191 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4192 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4193 for (i = 0;i < numvertices;i++)
// read position and the three basis vectors for vertex i (xyz only)
4195 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4196 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4197 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4198 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4199 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4200 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4201 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4202 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4203 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4204 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4205 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4206 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the light, then its components along
// svector / tvector / normal
4207 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4208 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4209 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4210 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4211 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4212 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// TEXCOORD1 now carries the light vector (svector was already copied out above)
4213 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4214 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4215 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4216 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// same computation for the vertex-to-eye vector, stored in TEXCOORD2
4217 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4218 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4219 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4220 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4221 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4222 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4223 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4224 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4225 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4226 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// project the position; the second argument is -1 here rather than an array index
// NOTE(review): presumably -1 means "reuse the already loaded data" - confirm in DPSOFTRAST_Array_TransformProject
4228 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 is regenerated with the ModelToLight matrix; the pixel shader reads it as CubeVector
4229 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel stage of the LightSource shader mode: shades one span into a BGRA8
// buffer and hands it to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; supplies uniform4f, shader_permutation and shader_exactspecularmath
//   triangle - triangle whose attributes are interpolated
//   span     - horizontal span; pixels startx..endx-1 are processed
// Three per-pixel loops exist, selected by shader_permutation:
// SPECULAR (ambient + diffuse + specular), DIFFUSE (ambient + diffuse),
// and a last loop that applies the ambient term only.
4232 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4235 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4236 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4237 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4238 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4239 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4240 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4241 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4242 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4243 int x, startx = span->startx, endx = span->endx;
4244 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4245 float CubeVectordata[4];
4246 float CubeVectorslope[4];
4247 float LightVectordata[4];
4248 float LightVectorslope[4];
4249 float EyeVectordata[4];
4250 float EyeVectorslope[4];
4252 float diffusetex[4];
4254 float surfacenormal[4];
4255 float lightnormal[4];
4257 float specularnormal[4];
4260 float SpecularPower;
4261 float CubeVector[4];
// Color uniforms are stored with components 0 and 2 swapped (index 2 receives
// uniform component 0), which lines them up with the bgra8 byte order of the
// texture and output buffers. The fourth component is forced to 0 except for
// Color_Ambient, which takes the Alpha uniform.
4264 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4265 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4266 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4267 Color_Glow[3] = 0.0f;
4268 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4269 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4270 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4271 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4272 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4273 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4274 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4275 Color_Diffuse[3] = 0.0f;
4276 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4277 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4278 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4279 Color_Specular[3] = 0.0f;
4280 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4281 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4282 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4283 Color_Pants[3] = 0.0f;
4284 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4285 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4286 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4287 Color_Shirt[3] = 0.0f;
4288 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4289 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4290 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4291 LightColor[3] = 0.0f;
// the exponent is pre-divided by 255 because it is later multiplied by the
// raw 0..255 gloss alpha byte (glosstex[3])
4292 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// interpolation setup: a base value and per-x slope for each varying, used
// below as (data + slope*x) * z. TEXCOORD1/2/3 are the light vector, eye
// vector and cube (light-space) vector written by the LightSource vertex shader.
4293 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4294 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4295 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4296 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4297 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// sample the textures needed by every path: color, and optionally pants/shirt and the cube filter
4298 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4299 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4301 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4302 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4304 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4305 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- path 1: specular permutation (ambient + diffuse + specular) ----
4306 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4308 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4309 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4310 for (x = startx;x < endx;x++)
// attenuation falls off as 1 - |CubeVector|^2
// NOTE(review): z is presumably the per-pixel value taken from buffer_z - confirm
4313 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4314 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4315 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4316 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): pixels below the 0.01 threshold are presumably skipped, leaving the zeros written by the memset above - confirm
4317 if (attenuation < 0.01f)
4319 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4321 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4322 if (attenuation < 0.01f)
// diffuse texel as floats in 0..255, with optional pants/shirt layers added in
4326 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4327 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4328 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4329 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4330 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4332 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4333 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4334 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4335 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4337 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4338 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4339 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4340 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte/128 - 1, reading the bytes in reverse order
// (byte 2 -> component 0) because the texel is stored as BGRA
4341 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4342 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4343 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4344 DPSOFTRAST_Vector3Normalize(surfacenormal);
4346 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4347 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4348 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4349 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(N, L), clamped at 0
4351 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term, variant A: reflection vector dotted with the eye vector
4353 if(thread->shader_exactspecularmath)
4355 // reflect lightnormal at surfacenormal, take the negative of that
4356 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4358 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4359 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4360 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4361 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4363 // dot of this and normalize(EyeVectorFogDepth.xyz)
4364 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4365 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4366 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4367 DPSOFTRAST_Vector3Normalize(eyenormal);
4369 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// specular term, variant B: half vector normalize(L + E) dotted with the surface normal
4373 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4374 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4375 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4376 DPSOFTRAST_Vector3Normalize(eyenormal);
4378 specularnormal[0] = lightnormal[0] + eyenormal[0];
4379 specularnormal[1] = lightnormal[1] + eyenormal[1];
4380 specularnormal[2] = lightnormal[2] + eyenormal[2];
4381 DPSOFTRAST_Vector3Normalize(specularnormal);
4383 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent is SpecularPower (already scaled by 1/255) times the gloss alpha byte
4385 specular = pow(specular, SpecularPower * glosstex[3]);
// combine: (diffusetex * (ambient + diffuse) + gloss * specular) * LightColor * attenuation,
// saturating each color channel at 255; alpha is the diffuse texel alpha
4387 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4389 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4390 attenuation *= (1.0f / 255.0f);
4391 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4392 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4393 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4394 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube filter factor
4398 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4399 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4400 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4401 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4403 buffer_FragColorbgra8[x*4+0] = d[0];
4404 buffer_FragColorbgra8[x*4+1] = d[1];
4405 buffer_FragColorbgra8[x*4+2] = d[2];
4406 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: diffuse permutation (ambient + diffuse, no gloss/specular) ----
4409 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4411 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4412 for (x = startx;x < endx;x++)
// attenuation and shadow test, identical to path 1
4415 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4416 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4417 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4418 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4419 if (attenuation < 0.01f)
4421 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4423 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4424 if (attenuation < 0.01f)
4428 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4429 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4430 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4431 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4432 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4434 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4435 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4436 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4437 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4439 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4440 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4441 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4442 DPSOFTRAST_Vector3Normalize(surfacenormal);
4444 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4445 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4446 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4447 DPSOFTRAST_Vector3Normalize(lightnormal);
4449 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4450 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4452 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4453 attenuation *= (1.0f / 255.0f);
4454 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4455 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4456 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4457 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube filter factor
4461 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4462 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4463 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4464 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4466 buffer_FragColorbgra8[x*4+0] = d[0];
4467 buffer_FragColorbgra8[x*4+1] = d[1];
4468 buffer_FragColorbgra8[x*4+2] = d[2];
4469 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient term only (no normal map lookup, no diffuse/specular) ----
4474 for (x = startx;x < endx;x++)
4477 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4478 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4479 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4480 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4481 if (attenuation < 0.01f)
4483 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4485 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4486 if (attenuation < 0.01f)
4490 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4491 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4492 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4493 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4494 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4496 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4497 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4498 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4499 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4501 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4503 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4504 attenuation *= (1.0f / 255.0f);
4505 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4506 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4507 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4508 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube filter factor
4512 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4513 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4514 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4515 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4517 buffer_FragColorbgra8[x*4+0] = d[0];
4518 buffer_FragColorbgra8[x*4+1] = d[1];
4519 buffer_FragColorbgra8[x*4+2] = d[2];
4520 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span over for output
4523 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the Refraction shader mode.
// Issues three array transforms: POSITION through the ModelViewProjection
// matrix into TEXCOORD1 (read by the pixel shader as
// ModelViewProjectionPosition), TEXCOORD0 through the texture matrix, and
// finally the projecting transform of POSITION itself.
// NOTE(review): argument order of the helpers looks like (output, input, matrix) - confirm
4529 void DPSOFTRAST_VertexShader_Refraction(void)
4531 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4532 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4533 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the Refraction shader mode.
// For each pixel of the span: derive a screen-space texture coordinate from
// the interpolated clip-space position, offset it by the xy of the
// normal-map vector scaled by DistortScaleRefractReflect, sample the bound
// GL20TU_REFRACTION texture there and tint the texel with RefractColor.
// Returns without drawing anything when no refraction texture is bound.
//   thread   - per-thread state (uniform4f, texbound)
//   triangle - triangle whose attributes are interpolated
//   span     - horizontal span; pixels startx..endx-1 are processed
4536 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4538 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4540 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4542 int x, startx = span->startx, endx = span->endx;
4545 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4546 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4549 float ModelViewProjectionPositiondata[4];
4550 float ModelViewProjectionPositionslope[4];
4553 float ScreenScaleRefractReflect[2];
4554 float ScreenCenterRefractReflect[2];
4555 float DistortScaleRefractReflect[2];
4556 float RefractColor[4];
// nothing to sample from: bail out before the span is begun
4558 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4559 if(!texture) return;
4562 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4563 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4566 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
// only the first two components of the scale/center/distort uniforms are read here
4569 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4570 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4571 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4572 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4573 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4574 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// RefractColor is stored with components 0 and 2 swapped so it lines up with the BGRA texel bytes
4575 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4576 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4577 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4578 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4581 for (x = startx;x < endx;x++)
4583 float SafeScreenTexCoord[2];
4584 float ScreenTexCoord[2];
4591 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4592 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4594 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4595 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4596 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4598 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// NOTE(review): the GLSL quoted above uses DistortScaleRefractReflect.zw, while the code below
// multiplies by components 0 and 1 (loaded from uniform offsets +0/+1) - confirm which is intended
4599 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4600 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4601 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4602 DPSOFTRAST_Vector3Normalize(v);
4603 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4604 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4606 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4607 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4609 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// NOTE(review): the color products are stored into unsigned char without clamping; this
// presumably relies on RefractColor components being <= 1 - confirm
4610 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4611 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4612 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
// alpha comes from the uniform alone (texel alpha is ignored), capped at 255
4613 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4616 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the Water shader mode: only runs the projecting transform
// of POSITION with the ModelViewProjection matrix; no other arrays are touched.
4621 void DPSOFTRAST_VertexShader_Water(void)
4623 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the Water shader mode. No shading is computed: the span's
// range of the BGRA8 output buffer is filled with zero bytes and passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers
4627 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4630 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4631 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4632 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, covering pixels startx..endx-1 only
4633 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4634 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the ShowDepth shader mode: only runs the projecting
// transform of POSITION with the ModelViewProjection matrix.
4639 void DPSOFTRAST_VertexShader_ShowDepth(void)
4641 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the ShowDepth shader mode. Despite the name, no depth value
// is written to the color output here: the span's range of the BGRA8 buffer
// is zero-filled and passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers
4644 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4647 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4648 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4649 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, covering pixels startx..endx-1 only
4650 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4651 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the DeferredGeometry shader mode: only runs the projecting
// transform of POSITION with the ModelViewProjection matrix.
4656 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4658 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the DeferredGeometry shader mode. No shading is computed:
// the span's range of the BGRA8 output buffer is zero-filled and passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers
4661 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4664 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4665 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4666 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, covering pixels startx..endx-1 only
4667 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4668 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the DeferredLightSource shader mode: only runs the
// projecting transform of POSITION with the ModelViewProjection matrix.
4673 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4675 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the DeferredLightSource shader mode. No lighting is
// computed: the span's range of the BGRA8 output buffer is zero-filled and
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers
4678 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4681 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4682 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4683 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, covering pixels startx..endx-1 only
4684 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4685 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: the two stage functions plus the vertex arrays
// and texture units the mode uses. One entry per SHADERMODE_* value lives in
// DPSOFTRAST_ShaderModeTable.
4690 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage entry point; takes no arguments
4693 void (*Vertex)(void);
// pixel-stage entry point, invoked with the thread state, a triangle and one span of it
4694 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// list of DPSOFTRAST_ARRAY_* indices; the table initializers end each list with ~0
4695 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// list of GL20TU_* texture unit indices; the table initializers end each list with ~0
4696 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4698 DPSOFTRAST_ShaderModeInfo;
// Shader mode table, indexed by shader_mode (see the [thread->shader_mode] and
// [dpsoftrast.shader_mode] lookups in the draw code).
// Row layout: leading integer, vertex function, span function, array list,
// texture unit list. The leading integer is 2 in every row; it presumably
// initializes the lodarrayindex member that DPSOFTRAST_Interpret_Draw compares
// array indices against - confirm against the struct definition.
// ~0 stored into an unsigned char becomes the maximum unsigned char value, which
// is what the ">= DPSOFTRAST_ARRAY_TOTAL" / ">= DPSOFTRAST_MAXTEXTUREUNITS"
// checks in the readers pick up.
4700 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4702 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4703 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4704 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4705 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4706 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4707 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4708 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4709 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4710 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4711 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4712 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4713 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
// NOTE(review): the four rows below give no initializer for the texture unit
// list, so C zero-initializes it; every entry is then 0 rather than ~0, and a
// reader that stops only at an out-of-range entry will visit unit 0 for each
// slot. Compare with the Depth_Or_Shadow row, which passes {~0} explicitly -
// confirm whether the omission is intended.
4714 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4715 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4716 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4717 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel mask for one span.
// With depth testing enabled and a depth buffer present, each pixel's mask
// byte is the result of the configured depth function; otherwise every pixel
// in the range is marked as passing. The mask and the (possibly narrowed)
// start of the range are written back into the span.
//   thread - supplies the mask storage, depth test state and triangle list
//   span   - span to test; updated in place
4720 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4725 unsigned int *depthpixel;
4729 unsigned char *pixelmask;
4730 DPSOFTRAST_State_Triangle *triangle;
4731 triangle = &thread->triangles[span->triangle];
// depth buffer address of the span origin (row span->y, column span->x); x below is relative to it
4732 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4733 startx = span->startx;
4735 depth = span->depthbase;
4736 depthslope = span->depthslope;
// the mask lives in per-thread storage and is shared by consecutive spans
4737 pixelmask = thread->pixelmaskarray;
4738 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4740 switch(thread->fb_depthfunc)
// every case steps d linearly from depth + depthslope*startx and compares the
// stored depth value (left operand) with d (right operand)
4743 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4744 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4745 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4746 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4747 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4748 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4749 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// trim the range while its first pixel is masked out
4751 while (startx < endx && !pixelmask[startx])
// trim the range while its last pixel is masked out
4753 while (endx > startx && !pixelmask[endx-1])
4758 // no depth testing means we're just dealing with color...
4759 memset(pixelmask + startx, 1, endx - startx);
// publish the results on the span for the span shader and DPSOFTRAST_Draw_DepthWrite
4761 span->pixelmask = pixelmask;
4762 span->startx = startx;
// Depth write pass for one span.
// Does nothing unless depth writes (depthmask) and depth testing are both
// enabled and a depth buffer is present. Otherwise it walks the span's pixel
// range with the same linear depth interpolation as DPSOFTRAST_Draw_DepthTest,
// reading the span's pixel mask.
//   thread - supplies depthmask / depthtest state
//   span   - span whose range, mask and depth plane are used
4766 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4768 int x, d, depth, depthslope, startx, endx;
4769 const unsigned char *pixelmask;
4770 unsigned int *depthpixel;
4771 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4773 depth = span->depthbase;
4774 depthslope = span->depthslope;
4775 pixelmask = span->pixelmask;
4776 startx = span->startx;
// depth buffer address of the span origin (row span->y, column span->x)
4778 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// NOTE(review): presumably stores d into depthpixel[x] for pixels whose mask
// byte is set - confirm against the loop body
4779 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Drains the thread's queued spans.
// For each span: run the depth test, check for an empty remaining range, run
// the current shader mode's span function when colour output is possible, and
// finally run the depth write. The span queue is reset to empty afterwards.
//   thread - owner of the span and triangle queues
4785 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4788 DPSOFTRAST_State_Triangle *triangle;
4789 DPSOFTRAST_State_Span *span;
4790 for (i = 0; i < thread->numspans; i++)
4792 span = &thread->spans[i];
// each span records the index of the triangle it was generated from
4793 triangle = &thread->triangles[span->triangle];
4794 DPSOFTRAST_Draw_DepthTest(thread, span);
// the depth test may have narrowed the range to nothing
4795 if (span->startx >= span->endx)
4797 // run pixel shader if appropriate
4798 // do this before running depthmask code, to allow the pixelshader
4799 // to clear pixelmask values for alpha testing
// colour output needs colour buffer 0 and a non-zero colour mask
4800 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4801 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4802 DPSOFTRAST_Draw_DepthWrite(thread, span);
// all queued spans have been consumed
4804 thread->numspans = 0;
// Draw command payload. DEFCOMMAND presumably generates DPSOFTRAST_OPCODE_Draw
// and struct DPSOFTRAST_Command_Draw with these fields - confirm against the macro.
4807 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);

// Executes one Draw command on behalf of one thread.
// For every triangle of the command it loads or near-plane-clips the screen
// space vertices, clamps the bounding box to the framebuffer limits, derives
// the interpolation planes for w and for each vertex array used by the current
// shader mode, picks a mip level per texture unit, and then walks the rows of
// the thread's two row bands emitting spans into thread->spans. Queued spans
// are flushed through DPSOFTRAST_Draw_ProcessSpans whenever the span or
// triangle queue fills up and once more at the end.
// On both exit paths the command's reference count is decremented; the thread
// that brings it to zero frees command->arrays when the command is no larger
// than the bare (aligned) command header, i.e. when the vertex data was
// allocated separately by DPSOFTRAST_Draw_AllocateDrawCommand.
//   thread  - thread state (scissor, bands, queues, shader mode, clip plane)
//   command - the Draw command to execute
4809 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4812 int cullface = thread->cullface;
4813 int minx, maxx, miny, maxy;
4814 int miny1, maxy1, miny2, maxy2;
4815 __m128i fbmin, fbmax;
4816 __m128 viewportcenter, viewportscale;
4817 int firstvertex = command->firstvertex;
4818 int numvertices = command->numvertices;
4819 int numtriangles = command->numtriangles;
4820 const int *element3i = command->element3i;
4821 const unsigned short *element3s = command->element3s;
4822 int clipped = command->clipped;
4829 int starty, endy, bandy;
4833 float clip0origin, clip0slope;
4835 __m128 triangleedge1, triangleedge2, trianglenormal;
4838 DPSOFTRAST_State_Triangle *triangle;
4839 DPSOFTRAST_Texture *texture;
4840 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor limits; the thread's two row bands are clamped into them
4841 miny = thread->fb_scissor[1];
4842 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4843 miny1 = bound(miny, thread->miny1, maxy);
4844 maxy1 = bound(miny, thread->maxy1, maxy);
4845 miny2 = bound(miny, thread->miny2, maxy);
4846 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range overlaps neither band: only our reference needs releasing
4847 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4849 if (!ATOMIC_DECREMENT(command->refcount))
4851 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4852 MM_FREE(command->arrays);
// horizontal scissor limits
4856 minx = thread->fb_scissor[0];
4857 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// limits replicated as 16-bit (x, y) pairs, matching the packed layout of screeni below;
// fbmax is made inclusive by subtracting 1
4858 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4859 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4860 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4861 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4862 screen[3] = _mm_setzero_ps();
4863 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4864 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen space coordinates, followed by the
// post-transform arrays (4 floats per vertex each)
4866 const float *screencoord4f = command->arrays;
4867 const float *arrays = screencoord4f + numvertices*4;
4869 // generate the 3 edges of this triangle
4870 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices are rebased by firstvertex; 16-bit index list variant
4873 e[0] = element3s[i*3+0] - firstvertex;
4874 e[1] = element3s[i*3+1] - firstvertex;
4875 e[2] = element3s[i*3+2] - firstvertex;
// 32-bit index list variant
4879 e[0] = element3i[i*3+0] - firstvertex;
4880 e[1] = element3i[i*3+1] - firstvertex;
4881 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the per-triangle code below:
//   SKIPBACKFACE      - forms two edge vectors from screen[1], computes one scalar
//                       cross product term into trianglenormal and compares it with zero
//   CLIPPEDVERTEXLERP - stores clipdist[p1] / (clipdist[p1] - clipdist[p2]) in clipfrac[p1],
//                       interpolates vertices p1 and p2 by it and projects the result to screen[k]
//   CLIPPEDVERTEXCOPY - loads screen[k] straight from the precomputed screen coordinates
//   GENATTRIBCOPY / GENATTRIBLERP - the same copy / interpolate choice for an attribute value,
//                       reusing the clipfrac values stored by CLIPPEDVERTEXLERP
//   GENATTRIBS        - selects, by case number 0..6, which of the three attribute values
//                       are copied and which are interpolated
4890 #define SKIPBACKFACE \
4891 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4892 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4893 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4894 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4895 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4899 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4903 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4908 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4909 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4911 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4912 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4914 #define CLIPPEDVERTEXCOPY(k,p1) \
4915 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4917 #define GENATTRIBCOPY(attrib, p1) \
4918 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4919 #define GENATTRIBLERP(attrib, p1, p2) \
4921 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4922 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4924 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4928 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4929 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4930 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4931 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4932 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4933 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4934 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4940 // calculate distance from nearplane
4941 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4942 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4943 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify the triangle by which vertices have a non-negative near plane
// distance; vertices on the other side are replaced by interpolated points,
// which can turn the triangle into a four-point polygon (screen[3] is used)
4944 if (clipdist[0] >= 0.0f)
4946 if (clipdist[1] >= 0.0f)
4948 if (clipdist[2] >= 0.0f)
4951 // triangle is entirely in front of nearplane
4952 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4959 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4967 if (clipdist[2] >= 0.0f)
4969 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4976 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4983 else if (clipdist[1] >= 0.0f)
4985 if (clipdist[2] >= 0.0f)
4987 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4994 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5000 else if (clipdist[2] >= 0.0f)
5002 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5007 else continue; // triangle is entirely behind nearplane
5010 // calculate integer y coords for triangle points
5011 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5012 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5013 screenmin = _mm_min_epi16(screeni, screenir),
5014 screenmax = _mm_max_epi16(screeni, screenir);
// finish the min/max reduction over all points, then clamp the bounding box to the framebuffer limits
5015 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5016 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5017 screenmin = _mm_max_epi16(screenmin, fbmin);
5018 screenmax = _mm_min_epi16(screenmax, fbmax);
5019 // skip offscreen triangles
5020 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// element 1 of the packed pairs is y; endy is exclusive
5022 starty = _mm_extract_epi16(screenmin, 1);
5023 endy = _mm_extract_epi16(screenmax, 1)+1;
// the rows lie entirely between the end of band 1 and the start of band 2
5024 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift keeps only the y half of each packed (x, y) pair, one 32-bit lane per point
5026 screeny = _mm_srai_epi32(screeni, 16);
5029 triangle = &thread->triangles[thread->numtriangles];
5031 // calculate attribute plans for triangle data...
5032 // okay, this triangle is going to produce spans, we'd better project
5033 // the interpolants now (this is what gives perspective texturing),
5034 // this consists of simply multiplying all arrays by the W coord
5035 // (which is basically 1/Z), which will be undone per-pixel
5036 // (multiplying by Z again) to get the perspective-correct array
5039 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5040 __m128 mipedgescale, mipdensity;
// edge components divided by the scalar normal term, then broadcast into four per-axis factors
5041 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5042 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5043 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5044 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5045 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of each of the first three screen points, broadcast to all lanes
5046 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5047 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5048 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5049 attribedge1 = _mm_sub_ss(w0, w1);
5050 attribedge2 = _mm_sub_ss(w2, w1);
5051 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5052 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5053 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5054 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5055 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// plane for w, evaluated later as w[2] + x*w[0] + y*w[1]
5056 _mm_store_ss(&triangle->w[0], attribxslope);
5057 _mm_store_ss(&triangle->w[1], attribyslope);
5058 _mm_store_ss(&triangle->w[2], attriborigin);
// a clip plane with a non-zero x, y or z coefficient is set: build the same
// kind of plane for screen z and combine it with the clip plane coefficients
5063 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5065 float cliporigin, clipxslope, clipyslope;
5066 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5067 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5068 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5069 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5070 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5071 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5072 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5073 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5074 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
// express the clip boundary as an x position per row: clip0origin + row*clip0slope
5077 clip0origin = -cliporigin/clipxslope;
5078 clip0slope = -clipyslope/clipxslope;
5079 clip0dir = clipxslope > 0 ? 1 : -1;
5081 else if(clipyslope > 0)
5083 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5084 clip0slope = dpsoftrast.fb_width;
5087 else if(clipyslope < 0)
5089 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5090 clip0slope = -dpsoftrast.fb_width;
5093 else if(clip0origin < 0) continue;
5096 mipedgescale = _mm_setzero_ps();
// one interpolation plane per vertex array listed for the current shader mode
5097 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5099 __m128 attrib0, attrib1, attrib2;
5100 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5101 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next array in the command's data (4 floats per vertex)
5103 arrays += numvertices*4;
5104 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are premultiplied by w before the plane is built (see the comment block above)
5105 attriborigin = _mm_mul_ps(attrib1, w1);
5106 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5107 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5108 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5109 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5110 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// stored as x slope, y slope, origin
5111 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5112 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5113 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array selected by lodarrayindex drives mip selection: attribute
// differences along the two edges scaled by the reciprocal edge lengths
5114 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5116 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5117 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5118 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5119 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// default every unit to mip 0, then refine the units listed for the shader mode
5123 memset(triangle->mip, 0, sizeof(triangle->mip));
5124 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5126 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5127 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5129 texture = thread->texbound[texunit];
// only textures with a filter mode above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a mip level computed
5130 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two integers read from texture->mipmap[0][2] onward (presumably the
// base level width and height - confirm), square, and sum per edge; the smaller
// of the two edge values is kept
5132 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5133 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5134 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5135 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5136 // this will be multiplied in the texturing routine by the texture resolution
5137 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm, since y holds a squared quantity; clamped to the last mip level
5140 y = (int)(log((float)y)*0.5f/M_LN2);
5141 if (y > texture->mipmaps - 1)
5142 y = texture->mipmaps - 1;
5143 triangle->mip[texunit] = y;
// walk the rows of band 1 first (bandy = end of band 1), then continue in band 2
5149 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5152 __m128 xcoords, xslope;
// one 32-bit lane per point: all ones where y is greater than that point's integer y,
// so each point contributes four bits to yccmask
5153 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5154 int yccmask = _mm_movemask_epi8(ycc);
5155 int edge0p, edge0n, edge1p, edge1n;
// four-point polygon: pick the two edges (previous point -> next point) crossing this row
5164 case 0xFFFF: /*0000*/ y = endy; continue;
5165 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5166 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5167 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5168 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5169 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5170 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5171 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5172 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5173 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5174 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5175 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5176 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5177 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5178 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5179 case 0x0000: /*1111*/ y++; continue;
// three-point polygon: same selection with point indices 0..2
5187 case 0xFFFF: /*000*/ y = endy; continue;
5188 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5189 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5190 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5191 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5192 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5193 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5194 case 0x0000: /*111*/ y++; continue;
// reduce to a single row value in nexty (last row handled with the current
// edge pair), then keep it inside the current band
5197 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5198 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5199 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5200 nexty = _mm_extract_epi16(ycc, 0);
5201 if (nexty >= bandy) nexty = bandy-1;
// per-row x step of both edges: edge delta divided by its own y delta
5202 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5203 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
// x of both edges at row y, plus a 0.5 offset
5204 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5205 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5206 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low half: swap the two edges when edge 0 starts to the right of edge 1
5207 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5209 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5210 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// clip boundary x for the first row of this run; stepped by clip0slope per row below
5212 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5213 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5215 int startx, endx, offset;
5216 startx = _mm_cvtss_si32(xcoords);
5217 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp the row to the horizontal scissor limits and drop empty rows
5218 if (startx < minx) startx = minx;
5219 if (endx > maxx) endx = maxx;
5220 if (startx >= endx) continue;
// clip plane handling: rows entirely on the rejected side of clip0 are
// skipped, otherwise the row is cut at clip0
5228 if(endx <= clip0) continue;
5229 startx = (int)clip0;
5232 else if (endx > clip0)
5234 if(startx >= clip0) continue;
// split the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5239 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5241 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5242 span->triangle = thread->numtriangles;
// span->endx is relative to the span origin
5246 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5247 if (span->startx >= span->endx)
// evaluate the w plane at the span origin and convert it to the integer depth
// representation, applying the polygon offset terms
5249 wslope = triangle->w[0];
5250 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5251 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5252 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: process it now
5253 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5254 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: queued spans refer to triangles by index, so process
// them before the triangle slots are reused
5259 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5261 DPSOFTRAST_Draw_ProcessSpans(thread);
5262 thread->numtriangles = 0;
// release this thread's reference; the last one frees separately allocated vertex data
5266 if (!ATOMIC_DECREMENT(command->refcount))
5268 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5269 MM_FREE(command->arrays);
// flush whatever is still queued
5272 if (thread->numspans > 0 || thread->numtriangles > 0)
5274 DPSOFTRAST_Draw_ProcessSpans(thread);
5275 thread->numtriangles = 0;
// Allocates a Draw command together with its vertex and index storage.
// The data area holds, in order: screen coordinates, the post-transform
// position array, one float[4] array per vertex array listed for the current
// shader mode, and a private copy of the index list. When command header plus
// data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data is allocated
// separately with MM_CALLOC; otherwise it is placed directly after the command
// header inside the command itself.
// Also points dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] at the new
// storage and records firstvertex / numvertices in the global state.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of triangles in the index list
//   element3i / element3s    - 32-bit / 16-bit index lists to copy
4644
// Queues a triangle draw.
// Allocates the Draw command, runs the current shader mode's vertex function,
// and clamps the draw's row range to the framebuffer height. An empty row
// range releases separately allocated vertex data and undoes the command.
// Otherwise the command gets its clip flag and one reference per thread, and
// is either announced to the worker threads or executed via
// DPSOFTRAST_Draw_FlushThreads.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of triangles in the index list
//   element3i / element3s    - 32-bit / 16-bit index lists
5344 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5346 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5347 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// row range of the draw, clamped to [0, fb_height]
5348 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5349 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing visible: free out-of-line vertex data (same size test as in
// DPSOFTRAST_Interpret_Draw) and take the command back out of the queue
5350 if (command->starty >= command->endy)
5352 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5353 MM_FREE(command->arrays);
5354 DPSOFTRAST_UndoCommand(command->commandsize);
5357 command->clipped = dpsoftrast.drawclipped;
// every thread decrements this once in DPSOFTRAST_Interpret_Draw
5358 command->refcount = dpsoftrast.numthreads;
5360 if (dpsoftrast.usethreads)
5363 DPSOFTRAST_Draw_SyncCommands();
5364 for (i = 0; i < dpsoftrast.numthreads; i++)
5366 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// wake only starving threads whose row bands overlap the draw's row range
5367 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5368 Thread_CondSignal(thread->drawcond);
// presumably the non-threaded branch: execute the queued commands right away
5373 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command payload. DEFCOMMAND presumably generates
// DPSOFTRAST_OPCODE_SetRenderTargets and struct
// DPSOFTRAST_Command_SetRenderTargets with these fields - confirm against the macro.
5377 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Per-thread handler for SetRenderTargets: flags the thread's framebuffer
// state (DPSOFTRAST_VALIDATE_FB) in thread->validate. The command payload is
// not read in the visible code.
5378 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5380 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Selects the framebuffer to render into.
// Stores the new size, depth buffer and up to four colour buffers in the
// global state, recomputes the viewport centre/scale for them and queues a
// SetRenderTargets command carrying the new width and height.
//   width, height   - framebuffer size
//   depthpixels     - depth buffer
//   colorpixels0..3 - colour buffers
5382 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5384 DPSOFTRAST_Command_SetRenderTargets *command;
// change detection against the currently stored targets
// NOTE(review): confirm which of the statements below this condition guards
5385 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5386 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5387 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
// record the new targets in the global state
5389 dpsoftrast.fb_width = width;
5390 dpsoftrast.fb_height = height;
5391 dpsoftrast.fb_depthpixels = depthpixels;
5392 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5393 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5394 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5395 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// viewport centre/scale are recomputed after the framebuffer size is updated
5396 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// queue the command for the threads
5397 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5398 command->width = width;
5399 command->height = height;
// Executes queued commands for one thread.
// Starts at thread->commandoffset and dispatches on each command's opcode until
// endoffset is reached. Fixed-size commands advance the offset by their aligned
// struct size (INTERPCOMMAND); Draw commands advance it by the size stored in
// the command. The final offset is written back to thread->commandoffset.
//   thread    - thread whose command position is advanced
//   endoffset - command pool offset at which to stop
5402 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5404 int commandoffset = thread->commandoffset;
5405 while (commandoffset != endoffset)
5407 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5408 switch (command->opcode)
// INTERPCOMMAND(name): case label that calls DPSOFTRAST_Interpret_<name>, steps
// over the aligned command struct and wraps the offset to 0 at the end of the pool
5410 #define INTERPCOMMAND(name) \
5411 case DPSOFTRAST_OPCODE_##name : \
5412 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5413 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5414 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5415 commandoffset = 0; \
5417 INTERPCOMMAND(Viewport)
5418 INTERPCOMMAND(ClearColor)
5419 INTERPCOMMAND(ClearDepth)
5420 INTERPCOMMAND(ColorMask)
5421 INTERPCOMMAND(DepthTest)
5422 INTERPCOMMAND(ScissorTest)
5423 INTERPCOMMAND(Scissor)
5424 INTERPCOMMAND(BlendFunc)
5425 INTERPCOMMAND(BlendSubtract)
5426 INTERPCOMMAND(DepthMask)
5427 INTERPCOMMAND(DepthFunc)
5428 INTERPCOMMAND(DepthRange)
5429 INTERPCOMMAND(PolygonOffset)
5430 INTERPCOMMAND(CullFace)
5431 INTERPCOMMAND(AlphaTest)
5432 INTERPCOMMAND(AlphaFunc)
5433 INTERPCOMMAND(SetTexture)
5434 INTERPCOMMAND(SetShader)
5435 INTERPCOMMAND(Uniform4f)
5436 INTERPCOMMAND(UniformMatrix4f)
5437 INTERPCOMMAND(Uniform1i)
5438 INTERPCOMMAND(SetRenderTargets)
5439 INTERPCOMMAND(ClipPlane)
// Draw commands are variable-sized, so the step comes from command->commandsize;
// the thread's offset is published right after each draw
5441 case DPSOFTRAST_OPCODE_Draw:
5442 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5443 commandoffset += command->commandsize;
5444 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5446 thread->commandoffset = commandoffset;
// NOTE(review): presumably restarts reading at the beginning of the pool - confirm
5449 case DPSOFTRAST_OPCODE_Reset:
5454 thread->commandoffset = commandoffset;
// Worker thread entry point.
// Loops while thread->index is non-negative. When the thread is behind
// dpsoftrast.drawcommand it interprets the pending commands; when it has caught
// up it signals a waiting flusher (if any), marks itself starving and sleeps on
// its draw condition until woken.
//   data - the DPSOFTRAST_State_Thread for this worker
5457 static int DPSOFTRAST_Draw_Thread(void *data)
5459 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5460 while(thread->index >= 0)
5462 if (thread->commandoffset != dpsoftrast.drawcommand)
5464 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5468 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex so newly queued work or a shutdown request is noticed before sleeping
5469 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// tell a flusher blocked in DPSOFTRAST_Draw_FlushThreads that this thread has caught up
5471 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving marks the thread as asleep for the producers that signal drawcond
5472 thread->starving = true;
5473 Thread_CondWait(thread->drawcond, thread->drawmutex);
5474 thread->starving = false;
5476 Thread_UnlockMutex(thread->drawmutex);
/*
 * Bring every thread slot's command offset up to dpsoftrast.drawcommand and
 * then reset the command pool's used-command count to 0.
 *
 * DPSOFTRAST_Draw_SyncCommands() is called first. When dpsoftrast.usethreads
 * is set, one pass signals the workers that are behind and asleep and a
 * second pass blocks until each worker that is still behind signals back.
 *
 * NOTE(review): the listing numbers embedded in these lines skip several
 * values (for example 5485 and 5513-5517), so the braces, the declaration of
 * `i` and any `else` are not visible here - confirm against the full file.
 */
5482 static void DPSOFTRAST_Draw_FlushThreads(void)
5484 DPSOFTRAST_State_Thread *thread;
5486 DPSOFTRAST_Draw_SyncCommands();
5487 if (dpsoftrast.usethreads)
/* pass 1: signal drawcond for every worker that is behind and flagged starving */
5489 for (i = 0; i < dpsoftrast.numthreads; i++)
5491 thread = &dpsoftrast.threads[i];
/* unlocked pre-check; the same comparison is repeated under the mutex below */
5492 if (thread->commandoffset != dpsoftrast.drawcommand)
5494 Thread_LockMutex(thread->drawmutex);
5495 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5496 Thread_CondSignal(thread->drawcond);
5497 Thread_UnlockMutex(thread->drawmutex);
/* pass 2: for every worker that is still behind, block on its waitcond */
5500 for (i = 0; i < dpsoftrast.numthreads; i++)
5502 thread = &dpsoftrast.threads[i];
5503 if (thread->commandoffset != dpsoftrast.drawcommand)
5505 Thread_LockMutex(thread->drawmutex);
5506 if (thread->commandoffset != dpsoftrast.drawcommand)
/* waiting asks the worker to signal waitcond before it next sleeps (see DPSOFTRAST_Draw_Thread) */
/* NOTE(review): this wait is guarded by `if`, not a loop; POSIX condition waits may wake spuriously - confirm whether Thread_CondWait protects against that */
5508 thread->waiting = true;
5509 Thread_CondWait(thread->waitcond, thread->drawmutex);
5510 thread->waiting = false;
5512 Thread_UnlockMutex(thread->drawmutex);
/* presumably the non-threaded path (the line introducing it is not shown): interpret each slot's pending commands on the calling thread */
5518 for (i = 0; i < dpsoftrast.numthreads; i++)
5520 thread = &dpsoftrast.threads[i];
5521 if (thread->commandoffset != dpsoftrast.drawcommand)
5522 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
/* mark the command pool as holding no commands */
5525 dpsoftrast.commandpool.usedcommands = 0;
/* Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads(). */
5528 void DPSOFTRAST_Flush(void)
5530 DPSOFTRAST_Draw_FlushThreads();
/*
 * NOTE(review): only the signature of DPSOFTRAST_Finish is visible in this
 * listing (the embedded numbers 5534-5537 are skipped), so its behaviour is
 * deliberately left undocumented here - confirm against the full file.
 */
5533 void DPSOFTRAST_Finish(void)
5538 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5548 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5549 dpsoftrast.bigendian = u.b[3];
5550 dpsoftrast.fb_width = width;
5551 dpsoftrast.fb_height = height;
5552 dpsoftrast.fb_depthpixels = depthpixels;
5553 dpsoftrast.fb_colorpixels[0] = colorpixels;
5554 dpsoftrast.fb_colorpixels[1] = NULL;
5555 dpsoftrast.fb_colorpixels[1] = NULL;
5556 dpsoftrast.fb_colorpixels[1] = NULL;
5557 dpsoftrast.viewport[0] = 0;
5558 dpsoftrast.viewport[1] = 0;
5559 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5560 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5561 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5562 dpsoftrast.texture_firstfree = 1;
5563 dpsoftrast.texture_end = 1;
5564 dpsoftrast.texture_max = 0;
5565 dpsoftrast.color[0] = 1;
5566 dpsoftrast.color[1] = 1;
5567 dpsoftrast.color[2] = 1;
5568 dpsoftrast.color[3] = 1;
5569 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5570 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5571 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5572 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5573 for (i = 0; i < dpsoftrast.numthreads; i++)
5575 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5577 thread->cullface = GL_BACK;
5578 thread->colormask[0] = 1;
5579 thread->colormask[1] = 1;
5580 thread->colormask[2] = 1;
5581 thread->colormask[3] = 1;
5582 thread->blendfunc[0] = GL_ONE;
5583 thread->blendfunc[1] = GL_ZERO;
5584 thread->depthmask = true;
5585 thread->depthtest = true;
5586 thread->depthfunc = GL_LEQUAL;
5587 thread->scissortest = false;
5588 thread->alphatest = false;
5589 thread->alphafunc = GL_GREATER;
5590 thread->alphavalue = 0.5f;
5591 thread->viewport[0] = 0;
5592 thread->viewport[1] = 0;
5593 thread->viewport[2] = dpsoftrast.fb_width;
5594 thread->viewport[3] = dpsoftrast.fb_height;
5595 thread->scissor[0] = 0;
5596 thread->scissor[1] = 0;
5597 thread->scissor[2] = dpsoftrast.fb_width;
5598 thread->scissor[3] = dpsoftrast.fb_height;
5599 thread->depthrange[0] = 0;
5600 thread->depthrange[1] = 1;
5601 thread->polygonoffset[0] = 0;
5602 thread->polygonoffset[1] = 0;
5603 thread->clipplane[0] = 0;
5604 thread->clipplane[1] = 0;
5605 thread->clipplane[2] = 0;
5606 thread->clipplane[3] = 1;
5608 thread->numspans = 0;
5609 thread->numtriangles = 0;
5610 thread->commandoffset = 0;
5611 thread->waiting = false;
5612 thread->starving = false;
5614 thread->validate = -1;
5615 DPSOFTRAST_Validate(thread, -1);
5617 if (dpsoftrast.usethreads)
5619 thread->waitcond = Thread_CreateCond();
5620 thread->drawcond = Thread_CreateCond();
5621 thread->drawmutex = Thread_CreateMutex();
5622 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5628 void DPSOFTRAST_Shutdown(void)
5631 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5633 DPSOFTRAST_State_Thread *thread;
5634 for (i = 0; i < dpsoftrast.numthreads; i++)
5636 thread = &dpsoftrast.threads[i];
5637 Thread_LockMutex(thread->drawmutex);
5639 Thread_CondSignal(thread->drawcond);
5640 Thread_UnlockMutex(thread->drawmutex);
5641 Thread_WaitThread(thread->thread, 0);
5642 Thread_DestroyCond(thread->waitcond);
5643 Thread_DestroyCond(thread->drawcond);
5644 Thread_DestroyMutex(thread->drawmutex);
5647 for (i = 0;i < dpsoftrast.texture_end;i++)
5648 if (dpsoftrast.texture[i].bytes)
5649 MM_FREE(dpsoftrast.texture[i].bytes);
5650 if (dpsoftrast.texture)
5651 free(dpsoftrast.texture);
5652 if (dpsoftrast.threads)
5653 MM_FREE(dpsoftrast.threads);
5654 memset(&dpsoftrast, 0, sizeof(dpsoftrast));