3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Alignment in bytes that MM_MALLOC / MM_CALLOC pass to _mm_malloc.
18 #define ATOMIC_SIZE 32
// Per-toolchain helpers:
//   ALIGN(var) / ATOMIC(var) - declare var with 16 / 32 byte alignment
//   MEMORY_BARRIER           - issued before publishing commands to worker threads
//   ATOMIC_COUNTER           - counter type operated on by ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD
// NOTE(review): the #else / #endif lines of this conditional chain are not visible in this excerpt.
// NOTE(review): _mm_sfence orders stores only; confirm that is sufficient wherever MEMORY_BARRIER is used.
// Apple toolchains: OSAtomic*Barrier primitives.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC-compatible compilers: __sync builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec alignment and Interlocked* primitives.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: plain declarations with no alignment attribute
// (presumably guarded by #ifndef lines that are not visible here).
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
// Fallbacks for anything the toolchain branch above did not define:
// a no-op barrier and ordinary, non-atomic integer arithmetic.
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// SSE2 intrinsics header.
74 #include <emmintrin.h>
// Aligned allocation: size bytes aligned to ATOMIC_SIZE.
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// Aligned, zero-filled allocation of nmemb*size bytes (_mm_malloc followed by memset).
// NOTE(review): nmemb*size is not checked for overflow, unlike calloc.
// NOTE(review): the braces and the return statement are not visible in this excerpt.
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Releases memory obtained from MM_MALLOC / MM_CALLOC.
85 #define MM_FREE _mm_free
// Alternative definitions built on the C allocator
// (presumably the branch used when the SSE path is disabled; the selecting #if / #else is not visible).
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight texcoord sets.
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays; it sizes the triangle attribs[]
// table and the post_array4f[] pointer table.
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table (dpsoftrast.texture[]). A slot whose bytes pointer is
// NULL is treated as free by the lookup / allocation code.
// NOTE(review): several members (flags, width, height, depth, sides, mipmaps, size, ...)
// are assigned elsewhere in the file but their declarations are not visible in this excerpt.
108 typedef struct DPSOFTRAST_Texture_s
// sampling filter selected through DPSOFTRAST_Texture_Filter
115 DPSOFTRAST_TEXTURE_FILTER filter;
// counter of ATOMIC_COUNTER type (presumably the number of active bindings - TODO confirm)
118 ATOMIC_COUNTER binds;
// pixel storage for all mip levels, allocated with MM_CALLOC
119 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// COMMAND_SIZE is the granularity DPSOFTRAST_ALIGNCOMMAND rounds command sizes up to;
// COMMAND_ALIGN applies the ALIGN() attribute to command structs.
// NOTE(review): ALIGN_SIZE is not defined in the visible part of the file - confirm where it comes from.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command stored in the command pool.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
// DPSOFTRAST_OPCODE_* value identifying the command
129 unsigned char opcode;
// size of the whole command in bytes, as passed to DPSOFTRAST_AllocateCommand
130 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when the remaining pool tail is too small.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> = opcodeval and a
// struct DPSOFTRAST_Command_<name> that starts with the same opcode/commandsize header.
// NOTE(review): the macro line that expands `fields` is not visible in this excerpt.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Size in bytes of the command ring buffer (2 MiB).
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// 16 KiB limit related to a single command (its use is not visible in this excerpt).
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Ring buffer that holds queued commands. The freecommand / usedcommands members
// referenced by the allocator are declared on lines not visible in this excerpt.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
// raw command storage, indexed by byte offset
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the span shaders (only part of the struct is visible here).
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][k][component]: the DPSOFTRAST_CALCATTRIB macros evaluate
// attribs[a][2] + x*attribs[a][0] + y*attribs[a][1], i.e. [0] = x slope, [1] = y slope, [2] = base value
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// DPSOFTRAST_CALCATTRIB (SSE) and DPSOFTRAST_CALCATTRIB4F (scalar float[4]) evaluate one
// interpolated attribute at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// The SSE variant uses aligned loads (_mm_load_ps) on the attribs rows.
// NOTE(review): the 4F variant writes (span->x) rather than (span)->x, so it is only safe
// when `span` is passed as a simple identifier.
// NOTE(review): the closing brace lines of both macros are not visible in this excerpt.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
170 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
171 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
172 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
173 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
174 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
175 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
176 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
177 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
178 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
// Sub-span length constant (its use is not visible in this excerpt).
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced while rasterizing a triangle.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
191 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
192 int depthslope; // depthbuffer value pixel delta
194 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[], triangles[] and pixelmaskarray[] buffers.
196 #define DPSOFTRAST_DRAW_MAXSPANS 1024
197 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
198 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Dirty bits kept in thread->validate; each selects one DPSOFTRAST_Recalc* step in
// DPSOFTRAST_Validate. DPSOFTRAST_VALIDATE_DRAW is the union of all three.
200 #define DPSOFTRAST_VALIDATE_FB 1
201 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
202 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
203 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc derives from the (sfactor, dfactor,
// blendsubtract) state and stores in thread->fb_blendmode.
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
205 typedef enum DPSOFTRAST_BLENDMODE_e
207 DPSOFTRAST_BLENDMODE_OPAQUE,
208 DPSOFTRAST_BLENDMODE_ALPHA,
209 DPSOFTRAST_BLENDMODE_ADDALPHA,
210 DPSOFTRAST_BLENDMODE_ADD,
211 DPSOFTRAST_BLENDMODE_INVMOD,
212 DPSOFTRAST_BLENDMODE_MUL,
213 DPSOFTRAST_BLENDMODE_MUL2,
214 DPSOFTRAST_BLENDMODE_SUBALPHA,
215 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
216 DPSOFTRAST_BLENDMODE_INVADD,
217 DPSOFTRAST_BLENDMODE_TOTAL
219 DPSOFTRAST_BLENDMODE;
// State owned by one rasterizer thread: its copy of the render state (updated by the
// DPSOFTRAST_Interpret_* functions), values derived from it, and scratch buffers.
// NOTE(review): many members used elsewhere in the file (index, viewport, scissor,
// validate, miny1/maxy1, drawmutex, ...) are declared on lines not visible in this excerpt.
221 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
240 float polygonoffset[2];
// clip plane transformed by DPSOFTRAST_RecalcClipPlane
242 ALIGN(float fb_clipplane[4]);
245 int shader_permutation;
246 int shader_exactspecularmath;
// textures bound per unit; pointers into dpsoftrast.texture[], re-based by DPSOFTRAST_Texture_Grow
248 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
250 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
251 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
253 // DPSOFTRAST_VALIDATE_ flags
256 // derived values (DPSOFTRAST_VALIDATE_FB)
// filled in by DPSOFTRAST_RecalcViewport
259 ALIGN(float fb_viewportcenter[4]);
260 ALIGN(float fb_viewportscale[4]);
262 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
265 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// byte offset into the command pool; compared against dpsoftrast.drawcommand by DPSOFTRAST_Draw_FreeCommandPool
274 ATOMIC(volatile int commandoffset);
// set by DPSOFTRAST_Draw_FreeCommandPool while it blocks on waitcond for this thread
276 volatile bool waiting;
// when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before waiting
277 volatile bool starving;
284 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
285 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
286 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
288 DPSOFTRAST_State_Thread);
// Global rasterizer state used by the public API: framebuffer pointers, vertex array
// pointers, the texture table, the worker threads and the command pool.
// NOTE(review): several members used elsewhere in the file (fb_width, fb_height, viewport,
// numthreads, usethreads, interlace, texture_max, texture_end, ...) are declared on lines
// not visible in this excerpt.
290 typedef ATOMIC(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers, addressed as y * fb_width + x
294 unsigned int *fb_depthpixels;
295 unsigned int *fb_colorpixels[4];
// filled in by DPSOFTRAST_RecalcViewport from dpsoftrast.viewport
298 ALIGN(float fb_viewportcenter[4]);
299 ALIGN(float fb_viewportscale[4]);
302 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
303 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-supplied vertex arrays
305 const float *pointer_vertex3f;
306 const float *pointer_color4f;
307 const unsigned char *pointer_color4ub;
308 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
311 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
312 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// textures bound per unit; pointers into texture[], re-based by DPSOFTRAST_Texture_Grow
313 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
317 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
318 float *screencoord4f;
324 int shader_permutation;
325 int shader_exactspecularmath;
// index at which DPSOFTRAST_Texture_New starts searching for a free slot
329 int texture_firstfree;
// texture table, grown with realloc by DPSOFTRAST_Texture_Grow
330 DPSOFTRAST_Texture *texture;
// most recent error message; points at a string literal
335 const char *errorstring;
// array of numthreads worker states
340 DPSOFTRAST_State_Thread *threads;
// command pool offset published to the workers by DPSOFTRAST_Draw_SyncCommands
342 ATOMIC(volatile int drawcommand);
344 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
348 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a float depth to the integer depth buffer format (2^30).
350 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
351 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs float RGBA into a 32-bit BGRA value (a<<24 | r<<16 | g<<8 | b), rounding each channel.
// NOTE(review): arguments are not parenthesized and not clamped; values outside 0..1 spill
// into neighbouring channels.
352 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
// Converts a float depth to the integer format; the result decreases as d increases.
// NOTE(review): d is not parenthesized in (1-d).
353 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
// Forward declarations; the definitions are not visible in this excerpt.
355 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
356 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Derives the viewport transform from viewport = {x, y, width, height}.
// Outputs are float[4] arrays laid out as [0] = w, [1] = x, [2] = y, [3] = z:
//   center: x/y are the pixel centre of the viewport, z is 0.5, w is 0
//   scale : half the viewport width, minus half the height, 0.5 for z, 1 for w
// The y axis is flipped against dpsoftrast.fb_height, hence the negative y scale.
358 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
360 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
361 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
362 fb_viewportcenter[3] = 0.5f;
363 fb_viewportcenter[0] = 0.0f;
364 fb_viewportscale[1] = 0.5f * viewport[2];
365 fb_viewportscale[2] = -0.5f * viewport[3];
366 fb_viewportscale[3] = 0.5f;
367 fb_viewportscale[0] = 1.0f;
// Assigns the scanline bands [miny1, maxy1) and [miny2, maxy2) that this thread renders,
// splitting dpsoftrast.fb_height between dpsoftrast.numthreads threads by thread->index.
370 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the framebuffer is cut into 2*numthreads bands and the thread gets
// band `index` and band `numthreads + index`
372 if (dpsoftrast.interlace)
374 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
375 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
376 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
377 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// single band: both ranges are identical
// (presumably the else branch; the else line is not visible in this excerpt)
381 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
382 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Recomputes thread->fb_clipplane from thread->clipplane and the viewport transform:
// each coefficient is divided by the matching viewport scale (x, y, z, w order, using the
// [1], [2], [3], [0] slots of fb_viewportscale), then the viewport centre's contribution
// is subtracted from the constant term.
// Must run after DPSOFTRAST_RecalcViewport, since it reads fb_viewportscale / fb_viewportcenter.
// NOTE(review): divides by fb_viewportscale[1] and [2], which are zero for a zero-sized viewport.
386 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
388 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
389 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
390 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
391 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
392 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes everything derived from framebuffer state for one thread: the clamped
// scissor rectangle (thread->fb_scissor = {x, y, width, height}), the viewport transform,
// the clip plane and the thread's scanline bands.
// NOTE(review): the local declarations and the lower-bound clamps are on lines not
// visible in this excerpt; only the upper-bound clamps appear below.
395 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
397 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
398 // and viewport projection values
// scissor rectangle with y flipped against the framebuffer height
401 x1 = thread->scissor[0];
402 x2 = thread->scissor[0] + thread->scissor[2];
403 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
404 y2 = dpsoftrast.fb_height - thread->scissor[1];
// scissor test disabled: use the whole framebuffer
405 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
407 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
409 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
410 thread->fb_scissor[0] = x1;
411 thread->fb_scissor[1] = y1;
412 thread->fb_scissor[2] = x2 - x1;
413 thread->fb_scissor[3] = y2 - y1;
// order matters: the clip plane depends on the viewport values computed first
415 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
416 DPSOFTRAST_RecalcClipPlane(thread);
417 DPSOFTRAST_RecalcThread(thread);
// Derives the effective depth comparison: the configured depthfunc while depth testing
// is enabled, GL_ALWAYS otherwise.
420 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
422 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the (sfactor, dfactor) pair in thread->blendfunc, together with blendsubtract,
// to one DPSOFTRAST_BLENDMODE stored in thread->fb_blendmode. The pair is packed as
// (sfactor<<16)|dfactor so that a single switch can match it. Unrecognized combinations
// fall back to DPSOFTRAST_BLENDMODE_OPAQUE.
// The BLENDFUNC helper macro is defined inside the first switch and reused by the second.
425 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only GL_SRC_ALPHA / GL_ONE is recognized
427 if (thread->blendsubtract)
429 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
431 #define BLENDFUNC(sfactor, dfactor, blendmode) \
432 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
433 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
434 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending (presumably the else branch; the else line is not visible in this excerpt)
439 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
441 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
442 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
443 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
444 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
445 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs map to DPSOFTRAST_BLENDMODE_MUL
446 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
447 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
448 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
449 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
450 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
451 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the requested flags is dirty.
// NOTE(review): `thread` is not parenthesized in the expansion.
456 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state selected by `mask` (DPSOFTRAST_VALIDATE_* bits) that is
// currently marked dirty in thread->validate, clearing each dirty bit before its
// recalculation runs.
458 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested bits that are actually dirty
460 mask &= thread->validate;
463 if (mask & DPSOFTRAST_VALIDATE_FB)
465 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
466 DPSOFTRAST_RecalcFB(thread);
468 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
470 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
471 DPSOFTRAST_RecalcDepthFunc(thread);
473 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
475 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
476 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. A slot is returned only when the index is in
// [1, dpsoftrast.texture_end) and the slot has pixel storage allocated; index 0 is never valid.
// NOTE(review): the failure path is not visible in this excerpt; callers test the result
// against NULL.
480 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
482 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
483 return &dpsoftrast.texture[index];
// Enlarges the texture table and re-bases every bound-texture pointer (global and
// per-thread) onto the reallocated array.
// NOTE(review): the declarations of i and j, and possibly other statements, are on lines
// not visible in this excerpt.
487 static void DPSOFTRAST_Texture_Grow(void)
// remembered so that existing pointers can be converted to offsets after the realloc
489 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
490 DPSOFTRAST_State_Thread *thread;
494 // expand texture array as needed
495 if (dpsoftrast.texture_max < 1024)
496 dpsoftrast.texture_max = 1024;
// (presumably the else branch of the test above - the else line is not visible;
// without it the capacity would always be doubled)
498 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites the only pointer to the table and is not
// checked; on failure the old table leaks and the fix-ups below operate on NULL.
// NOTE(review): no visible code zero-fills the newly added slots - confirm that callers
// never read them before initializing them.
499 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// re-base the globally bound textures
500 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501 if (dpsoftrast.texbound[i])
502 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// re-base each thread's bound textures
503 for (j = 0; j < dpsoftrast.numthreads; j++)
505 thread = &dpsoftrast.threads[j];
506 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
507 if (thread->texbound[i])
508 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates flags and dimensions, claims the first free slot in the
// texture table (growing the table when needed), records the mip chain layout and
// allocates zero-filled pixel storage for all levels.
// On a validation failure dpsoftrast.errorstring is set to a message.
// NOTE(review): the return statements, local declarations, the switch header and the mip
// loop header are on lines not visible in this excerpt.
512 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// a cubemap stores six faces per level
521 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
522 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
523 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is computed before the maximum-size check below, so it can
// overflow, and two negative dimensions yield a positive product.
524 if (width*height*depth < 1)
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
529 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format checks (the switch header is not visible in this excerpt)
536 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
537 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
538 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
540 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
541 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
543 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the same message is assigned a second time here; if this is the switch's
// default case (label not visible), the text is misleading for an unrecognized format.
548 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
551 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
553 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
558 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
560 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
563 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
565 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): exact duplicate of the check directly above.
568 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
570 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
573 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x is not a power of two (for x >= 1)
578 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
583 // find first empty slot in texture array
584 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
585 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot just claimed
587 dpsoftrast.texture_firstfree = texnum + 1;
588 if (dpsoftrast.texture_max <= texnum)
589 DPSOFTRAST_Texture_Grow();
590 if (dpsoftrast.texture_end <= texnum)
591 dpsoftrast.texture_end = texnum + 1;
592 texture = &dpsoftrast.texture[texnum];
593 memset(texture, 0, sizeof(*texture));
594 texture->flags = flags;
595 texture->width = width;
596 texture->height = height;
597 texture->depth = depth;
598 texture->sides = sides;
// per mip level: s is the level's size in bytes (4 bytes per texel, all sides) and `size`
// is the level's byte offset (the loop header and the accumulation are not visible here)
610 s = w * h * d * sides * 4;
611 texture->mipmap[mipmaps][0] = size;
612 texture->mipmap[mipmaps][1] = s;
613 texture->mipmap[mipmaps][2] = w;
614 texture->mipmap[mipmaps][3] = h;
615 texture->mipmap[mipmaps][4] = d;
// stop at the 1x1x1 level, or after the first level when mipmaps were not requested
618 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
624 texture->mipmaps = mipmaps;
625 texture->size = size;
627 // allocate the pixels now
// NOTE(review): the allocation result is not checked in the visible lines; a NULL bytes
// pointer also marks the slot as free.
628 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the texture at `index`: frees its pixel storage, clears the slot and updates
// the free-slot hint and the used range. Invalid or already free indices are ignored.
// NOTE(review): lines between the lookup and MM_FREE are not visible in this excerpt.
632 void DPSOFTRAST_Texture_Free(int index)
634 DPSOFTRAST_Texture *texture;
635 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
639 MM_FREE(texture->bytes);
640 texture->bytes = NULL;
// clears every field, including bytes again
641 memset(texture, 0, sizeof(*texture));
642 // adjust the free range and used range
643 if (dpsoftrast.texture_firstfree > index)
644 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing free slots
645 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
646 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture by averaging texels of the parent
// level (4 bytes per texel, averaged per byte with rounding). Does nothing for an
// invalid index.
// NOTE(review): the assignments of layer0/layer1/row0/row1 and the conditions selecting
// between the 3D and 2D paths are on lines not visible in this excerpt.
648 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
650 int i, x, y, z, w, layer0, layer1, row0, row1;
651 unsigned char *o, *i0, *i1, *i2, *i3;
652 DPSOFTRAST_Texture *texture;
653 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// a single level has nothing to generate (the statement after this test is not visible)
654 if (texture->mipmaps <= 1)
656 for (i = 1;i < texture->mipmaps;i++)
658 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's depth
662 if (layer1 >= texture->mipmap[i-1][4])
663 layer1 = texture->mipmap[i-1][4]-1;
664 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's height
668 if (row1 >= texture->mipmap[i-1][3])
669 row1 = texture->mipmap[i-1][3]-1;
// o: destination row in level i; i0..i3: the four parent rows
// (layer0/layer1 x row0/row1) in level i-1
670 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
671 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
672 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
673 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
674 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
675 w = texture->mipmap[i][2];
678 if (texture->mipmap[i-1][2] > 1)
680 // average 3D texture
// eight parent texels per output texel
681 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
683 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
684 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
685 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
686 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
691 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are not advanced by this loop; this is harmless only if w is 1
// here, which the comment above suggests - TODO confirm.
692 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
694 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
695 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
696 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
697 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
703 if (texture->mipmap[i-1][2] > 1)
705 // average 2D texture (common case)
// four parent texels (2x2) per output texel
706 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
709 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
710 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
711 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
716 // 2D texture with parent width == 1
// two vertically adjacent parent texels per output texel
717 o[0] = (i0[0] + i1[0] + 1) >> 1;
718 o[1] = (i0[1] + i1[1] + 1) >> 1;
719 o[2] = (i0[2] + i1[2] + 1) >> 1;
720 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels` (tightly
// packed rows) into the texture at (blockx, blocky), then regenerates the mip chain.
// Does nothing for an invalid index.
// NOTE(review): the visible code always writes into mip level 0 and performs no bounds
// checks on the block; any use of `mip` or validation is on lines not visible in this
// excerpt, as are the declaration of dst and the loop's blockheight decrement.
727 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
729 DPSOFTRAST_Texture *texture;
731 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel; mipmap[0][2] is the level 0 width
736 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
737 while (blockheight > 0)
739 memcpy(dst, pixels, blockwidth * 4);
740 pixels += blockwidth * 4;
741 dst += texture->mipmap[0][2] * 4;
745 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with `pixels` (mipmap[0][1] bytes) and regenerates
// the mip chain. Does nothing for an invalid index.
// `pixels` must provide at least mipmap[0][1] bytes.
747 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
749 DPSOFTRAST_Texture *texture;
750 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
754 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
755 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of mip level `mip` of the texture, or 0 for an invalid index.
// NOTE(review): `mip` is not range-checked against DPSOFTRAST_MAXMIPMAPS.
757 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
759 DPSOFTRAST_Texture *texture;
760 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761 return texture->mipmap[mip][2];
// Returns the height of mip level `mip` of the texture, or 0 for an invalid index.
// NOTE(review): `mip` is not range-checked against DPSOFTRAST_MAXMIPMAPS.
763 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
765 DPSOFTRAST_Texture *texture;
766 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767 return texture->mipmap[mip][3];
// Returns the depth of mip level `mip` of the texture, or 0 for an invalid index.
// NOTE(review): `mip` is not range-checked against DPSOFTRAST_MAXMIPMAPS.
769 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
771 DPSOFTRAST_Texture *texture;
772 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
773 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's pixel
// storage, or a null pointer for an invalid index.
// NOTE(review): `mip` is not range-checked in the visible lines; two lines between the
// lookup and the return are not visible in this excerpt.
775 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
777 DPSOFTRAST_Texture *texture;
778 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
781 return texture->bytes + texture->mipmap[mip][0];
// Sets the sampling filter of a texture. Filters above DPSOFTRAST_TEXTURE_FILTER_LINEAR
// are reported through dpsoftrast.errorstring when the texture was created without
// DPSOFTRAST_TEXTURE_FLAG_MIPMAP. Does nothing for an invalid index.
// NOTE(review): the statement that follows the error assignment (presumably a return) is
// not visible in this excerpt.
783 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
785 DPSOFTRAST_Texture *texture;
786 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
787 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
789 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
794 texture->filter = filter;
// Forward declaration; the definition is not visible in this excerpt.
797 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes all commands written so far by copying the pool's write offset
// (commandpool.freecommand) into dpsoftrast.drawcommand.
799 static void DPSOFTRAST_Draw_SyncCommands(void)
// the barrier is issued before the new offset is stored, and only when threads are in use
801 if(dpsoftrast.usethreads) MEMORY_BARRIER;
802 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least `space` bytes available in the command pool by recomputing how
// much of the pool the worker threads still have to consume, and by waiting on a lagging
// thread when that is not enough.
// NOTE(review): the enclosing loop, the early returns, the declarations of i,
// commandoffset and waitindex, and the assignment of waitindex are on lines not visible
// in this excerpt.
805 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
807 DPSOFTRAST_State_Thread *thread;
809 int freecommand = dpsoftrast.commandpool.freecommand;
810 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already (the statement executed here is not visible)
811 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// make everything queued so far visible to the workers
813 DPSOFTRAST_Draw_SyncCommands();
// for each thread, measure how far its read offset is behind the write offset
// (modulo the pool size) and keep the largest distance
819 for (i = 0; i < dpsoftrast.numthreads; i++)
821 thread = &dpsoftrast.threads[i];
822 commandoffset = freecommand - thread->commandoffset;
823 if (commandoffset < 0)
824 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
825 if (commandoffset > usedcommands)
828 usedcommands = commandoffset;
831 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// block until the selected thread makes progress
833 thread = &dpsoftrast.threads[waitindex];
834 Thread_LockMutex(thread->drawmutex);
// wait only if the thread has not already caught up with the published offset
835 if (thread->commandoffset != dpsoftrast.drawcommand)
837 thread->waiting = true;
// wake the thread first if it is flagged as starving
838 if (thread->starving) Thread_CondSignal(thread->drawcond);
839 Thread_CondWait(thread->waitcond, thread->drawmutex);
840 thread->waiting = false;
842 Thread_UnlockMutex(thread->drawmutex);
844 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE (which must be a power of two for
// the mask arithmetic to be valid).
847 #define DPSOFTRAST_ALIGNCOMMAND(size) \
848 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with its opcode and aligned size filled in.
849 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
850 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command ring buffer and initializes the command header
// with `opcode` and `size`.
// NOTE(review): the return statement and the updates of freecommand after the Reset
// marker and after the allocation are on lines not visible in this excerpt.
// NOTE(review): commandsize is an unsigned short, so `size` must fit in 16 bits.
852 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
854 DPSOFTRAST_Command *command;
855 int freecommand = dpsoftrast.commandpool.freecommand;
856 int usedcommands = dpsoftrast.commandpool.usedcommands;
// space required beyond `size`: one command header, plus the unusable tail of the pool
// when the command does not fit before the end
857 int extra = sizeof(DPSOFTRAST_Command);
858 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
859 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim consumed commands (threaded) or flush (the else line
// between the two calls is not visible), then reload the offsets
860 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
862 if (dpsoftrast.usethreads)
863 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
865 DPSOFTRAST_Draw_FlushThreads();
866 freecommand = dpsoftrast.commandpool.freecommand;
867 usedcommands = dpsoftrast.commandpool.usedcommands;
// the command does not fit in the tail: write a Reset marker there and account for the
// skipped bytes
869 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
871 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
872 command->opcode = DPSOFTRAST_OPCODE_Reset;
873 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
876 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
877 command->opcode = opcode;
878 command->commandsize = size;
// wrap-around check (the statement it guards is not visible)
880 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
882 dpsoftrast.commandpool.freecommand = freecommand;
883 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated command of `size` bytes by moving the pool's
// write offset and used-byte count back.
// NOTE(review): the subtraction from freecommand and the condition guarding the
// wrap-around correction below are on lines not visible in this excerpt.
887 static void DPSOFTRAST_UndoCommand(int size)
889 int freecommand = dpsoftrast.commandpool.freecommand;
890 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap back into the pool (presumably when the offset became negative)
893 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
894 usedcommands -= size;
895 dpsoftrast.commandpool.freecommand = freecommand;
896 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command: carries the viewport rectangle to the worker threads.
899 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Worker side: stores the rectangle as viewport = {x, y, width, height} and marks the
// framebuffer-derived state dirty so that it is recomputed on the next validate.
900 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
902 thread->viewport[0] = command->x;
903 thread->viewport[1] = command->y;
904 thread->viewport[2] = command->width;
905 thread->viewport[3] = command->height;
906 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Viewport command and updates the global copy of the
// viewport and its derived centre/scale immediately.
// NOTE(review): the assignments of command->x and command->y are on lines not visible in
// this excerpt.
908 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
910 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
913 command->width = width;
914 command->height = height;
916 dpsoftrast.viewport[0] = x;
917 dpsoftrast.viewport[1] = y;
918 dpsoftrast.viewport[2] = width;
919 dpsoftrast.viewport[3] = height;
920 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command: carries the clear color as four floats.
923 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Worker side: fills this thread's scanline bands, restricted to the scissor rectangle,
// with the packed BGRA clear color in every attached color buffer.
// NOTE(review): the declarations of c, p and bandy, the inner row loop and the pixel
// store are on lines not visible in this excerpt.
924 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
926 int i, x1, y1, x2, y2, w, h, x, y;
927 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the band limits are up to date
931 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
932 miny1 = thread->miny1;
933 maxy1 = thread->maxy1;
934 miny2 = thread->miny2;
935 maxy2 = thread->maxy2;
936 x1 = thread->fb_scissor[0];
937 y1 = thread->fb_scissor[1];
938 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
939 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the vertical range to the rows owned by this thread
940 if (y1 < miny1) y1 = miny1;
941 if (y2 > maxy2) y2 = maxy2;
946 // FIXME: honor fb_colormask?
947 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
948 for (i = 0;i < 4;i++)
// skip color buffers that are not attached (the skipping statement is not visible)
950 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1, then y jumps to miny2 and the second pass covers
// rows up to maxy2
952 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
955 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
956 for (x = x1;x < x2;x++)
// Public entry point: queues a ClearColor command.
// NOTE(review): the assignments of the command fields are on lines not visible in this excerpt.
961 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
963 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command: carries the depth clear value as a float.
970 DEFCOMMAND(3, ClearDepth, float depth;)
// Worker side: fills this thread's scanline bands, restricted to the scissor rectangle,
// of the depth buffer with the converted depth value.
// NOTE(review): the declarations of c, p and bandy, the inner row loop and the pixel
// store are on lines not visible in this excerpt.
971 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
973 int x1, y1, x2, y2, w, h, x, y;
974 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the band limits are up to date
978 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
979 miny1 = thread->miny1;
980 maxy1 = thread->maxy1;
981 miny2 = thread->miny2;
982 maxy2 = thread->maxy2;
983 x1 = thread->fb_scissor[0];
984 y1 = thread->fb_scissor[1];
985 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
986 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the vertical range to the rows owned by this thread
987 if (y1 < miny1) y1 = miny1;
988 if (y2 > maxy2) y2 = maxy2;
993 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1, then y jumps to miny2 and the second pass covers
// rows up to maxy2
994 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
997 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
998 for (x = x1;x < x2;x++)
// Public entry point: queues a ClearDepth command.
// NOTE(review): the assignment of command->depth is on a line not visible in this excerpt.
1002 void DPSOFTRAST_ClearDepth(float d)
1004 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command: carries one enable flag per color channel.
1008 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Worker side: normalizes each flag to 0 or 1 and builds fb_colormask, a per-pixel write
// mask in BGRA layout (r -> 0x00FF0000, g -> 0x0000FF00, b -> 0x000000FF, a -> 0xFF000000).
1009 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1011 thread->colormask[0] = command->r != 0;
1012 thread->colormask[1] = command->g != 0;
1013 thread->colormask[2] = command->b != 0;
1014 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all ones, which then selects the channel's byte
1015 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public entry point: queues a ColorMask command.
// NOTE(review): the assignments of the command fields are on lines not visible in this excerpt.
1017 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1019 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command: enables or disables depth testing.
1026 DEFCOMMAND(5, DepthTest, int enable;)
// Worker side: stores the flag and marks the derived depth function dirty, since
// fb_depthfunc depends on depthtest.
1027 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1029 thread->depthtest = command->enable;
1030 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Public entry point: queues a DepthTest command.
1032 void DPSOFTRAST_DepthTest(int enable)
1034 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1035 command->enable = enable;
// ScissorTest command: enables or disables the scissor rectangle.
1038 DEFCOMMAND(6, ScissorTest, int enable;)
// Worker side: stores the flag and marks the framebuffer-derived state dirty, since
// fb_scissor depends on scissortest.
1039 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1041 thread->scissortest = command->enable;
1042 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a ScissorTest command.
1044 void DPSOFTRAST_ScissorTest(int enable)
1046 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1047 command->enable = enable;
// Scissor command: carries the scissor rectangle as floats.
1050 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Worker side: stores the rectangle as scissor = {x, y, width, height} and marks the
// framebuffer-derived state dirty.
// NOTE(review): the element type of thread->scissor is not visible here; if it is an
// integer array these assignments truncate - TODO confirm.
1051 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1053 thread->scissor[0] = command->x;
1054 thread->scissor[1] = command->y;
1055 thread->scissor[2] = command->width;
1056 thread->scissor[3] = command->height;
1057 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Scissor command.
// NOTE(review): the assignments of command->x and command->y are on lines not visible in
// this excerpt.
1059 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1061 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1064 command->width = width;
1065 command->height = height;
1068 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1069 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1071 thread->blendfunc[0] = command->sfactor;
1072 thread->blendfunc[1] = command->dfactor;
1073 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendFunc command and stores both blend factors in it.
1075 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1077 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1078 command->sfactor = sfactor;
1079 command->dfactor = dfactor;
// Command 9 (BlendSubtract) carries a single int payload: enable.
1082 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command to one thread's state.
1083 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1085 thread->blendsubtract = command->enable;
// shares the blend function validation flag with BlendFunc
1086 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendSubtract command and stores the enable flag in it.
1088 void DPSOFTRAST_BlendSubtract(int enable)
1090 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1091 command->enable = enable;
// Command 10 (DepthMask) carries a single int payload: enable.
1094 DEFCOMMAND(10, DepthMask, int enable;)
// Applies a DepthMask command: copies the flag into thread->depthmask.
// No validate flag is raised in the visible code.
1095 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1097 thread->depthmask = command->enable;
// Public entry point: allocates a DepthMask command and stores the enable flag in it.
1099 void DPSOFTRAST_DepthMask(int enable)
1101 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1102 command->enable = enable;
// Command 11 (DepthFunc) carries a single int payload: func.
1105 DEFCOMMAND(11, DepthFunc, int func;)
// Applies a DepthFunc command: copies the function selector into thread->depthfunc.
// NOTE(review): no validate flag is visible here even though DepthTest raises DPSOFTRAST_VALIDATE_DEPTHFUNC; confirm whether that is intentional.
1106 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1108 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and stores the function selector in it.
1110 void DPSOFTRAST_DepthFunc(int func)
1112 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1113 command->func = func;
// Command 12 (DepthRange) carries the near and far depth values as floats.
1116 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: depthrange[0] = near value, depthrange[1] = far value.
1117 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1119 thread->depthrange[0] = command->nearval;
1120 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and stores the near/far values in it.
1122 void DPSOFTRAST_DepthRange(float nearval, float farval)
1124 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1125 command->nearval = nearval;
1126 command->farval = farval;
// Command 13 (PolygonOffset) carries two floats: alongnormal and intoview.
1129 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: polygonoffset[0] = alongnormal, polygonoffset[1] = intoview.
1130 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1132 thread->polygonoffset[0] = command->alongnormal;
1133 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and stores both offsets in it.
1135 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1137 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1138 command->alongnormal = alongnormal;
1139 command->intoview = intoview;
// Command 14 (CullFace) carries a single int payload: mode.
1142 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command: copies the mode into thread->cullface.
1143 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1145 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and stores the cull mode in it.
1147 void DPSOFTRAST_CullFace(int mode)
1149 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1150 command->mode = mode;
// Command 15 (AlphaTest) carries a single int payload: enable.
1153 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command: copies the flag into thread->alphatest.
1154 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1156 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command and stores the enable flag in it.
1158 void DPSOFTRAST_AlphaTest(int enable)
1160 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1161 command->enable = enable;
// Command 16 (AlphaFunc) carries the comparison selector and the reference value.
1164 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: func goes to thread->alphafunc, ref to thread->alphavalue.
1165 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1167 thread->alphafunc = command->func;
1168 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and fills it in.
// NOTE(review): only the func store is visible here; the store of ref presumably follows it - confirm.
1170 void DPSOFTRAST_AlphaFunc(int func, float ref)
1172 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1173 command->func = func;
// Sets the current constant color in the global dpsoftrast state (order r, g, b, a).
// Unlike the command-based setters, the visible code writes dpsoftrast.color directly
// and allocates no command. DPSOFTRAST_Array_Load passes dpsoftrast.color to
// DPSOFTRAST_Fill4f when no color array pointer is set.
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1179 dpsoftrast.color[0] = r;
1180 dpsoftrast.color[1] = g;
1181 dpsoftrast.color[2] = b;
1182 dpsoftrast.color[3] = a;
// Reads back a rectangle of the color framebuffer (dpsoftrast.fb_colorpixels[0]) into outpixels.
//   blockx, blocky            - origin of the requested rectangle
//   blockwidth, blockheight   - size of the requested rectangle
//   outpixels                 - destination, addressed with a row pitch of blockwidth*4 bytes
// The rectangle is clamped to the framebuffer. Source rows are addressed as
// fb_height - 1 - y, so the framebuffer is read flipped vertically relative to y.
// NOTE(review): the per-pixel copy statements are not visible here; the bigendian branch
// presumably reorders bytes while the other branch copies rows as-is - confirm.
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// both pitches are in bytes, 4 bytes per pixel
1187 int outstride = blockwidth * 4;
1188 int instride = dpsoftrast.fb_width * 4;
1191 int bx2 = blockx + blockwidth;
1192 int by2 = blocky + blockheight;
1196 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer bounds
1200 if (bx1 < 0) bx1 = 0;
1201 if (by1 < 0) by1 = 0;
1202 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1205 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1206 if (dpsoftrast.bigendian)
1208 for (y = by1;y < by2;y++)
// b: start of the clipped source row (vertically flipped); o: start of the matching output row
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
// output rows are indexed relative to the clamped by1, not the requested blocky
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
1212 for (x = bx1;x < bx2;x++)
1225 for (y = by1;y < by2;y++)
1227 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the color framebuffer (dpsoftrast.fb_colorpixels[0]) into one mip level of a texture.
//   index         - texture handle, resolved with DPSOFTRAST_Texture_GetByIndex
//   mip           - mip level to write; must satisfy 0 <= mip < texture->mipmaps
//   tx, ty        - destination origin inside the texture level
//   sx, sy        - source origin inside the framebuffer
//   width, height - size of the rectangle
// Returns silently when the texture handle or the mip level is invalid.
// Source and destination rectangles are clamped independently to their surfaces and the
// copy size is the smaller of the two. Framebuffer rows are walked downwards from
// sheight - 1 - sy1 while texture rows are walked upwards, so the image is flipped vertically.
// NOTE(review): the declarations/initialisation of tx1, ty1, sx1, sy1, tw, th, sw, sh are not visible here.
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1238 int tx2 = tx + width;
1239 int ty2 = ty + height;
1242 int sx2 = sx + width;
1243 int sy2 = sy + height;
1253 unsigned int *spixels;
1254 unsigned int *tpixels;
1255 DPSOFTRAST_Texture *texture;
1256 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the color framebuffer
1259 spixels = dpsoftrast.fb_colorpixels[0];
1260 swidth = dpsoftrast.fb_width;
1261 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is the byte offset of the level inside texture->bytes,
// mipmap[mip][2] and [3] are its width and height
1262 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263 twidth = texture->mipmap[mip][2];
1264 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the texture level
1265 if (tx1 < 0) tx1 = 0;
1266 if (ty1 < 0) ty1 = 0;
1267 if (tx2 > twidth) tx2 = twidth;
1268 if (ty2 > theight) ty2 = theight;
// clamp the source rectangle to the framebuffer
// NOTE(review): the two rectangles are clamped independently, so clipping one origin does not shift the other - confirm this is intended.
1269 if (sx1 < 0) sx1 = 0;
1270 if (sy1 < 0) sy1 = 0;
1271 if (sx2 > swidth) sx2 = swidth;
1272 if (sy2 > sheight) sy2 = sheight;
// copy only the overlap of the two clamped sizes
1277 if (tw > sw) tw = sw;
1278 if (th > sh) th = sh;
1279 if (tw < 1 || th < 1)
// convert sy1 to the flipped framebuffer row to start from
1281 sy1 = sheight - 1 - sy1;
1282 for (y = 0;y < th;y++)
// one row per iteration, tw pixels of 4 bytes each
1283 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// rebuild the lower mip levels when the texture has more than one
1284 if (texture->mipmaps > 1)
1285 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture) carries the texture unit number and the texture pointer.
1288 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread: drops this thread's bind on the texture
// previously bound to the unit (if any), then binds the new one. The new pointer is
// stored without a NULL check.
1289 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1291 if (thread->texbound[command->unitnum])
// DPSOFTRAST_SetTexture adds dpsoftrast.numthreads to binds; each thread removes one here when the unit is rebound
1292 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1293 thread->texbound[command->unitnum] = command->texture;
1295 void DPSOFTRAST_SetTexture(int unitnum, int index)
1297 DPSOFTRAST_Command_SetTexture *command;
1298 DPSOFTRAST_Texture *texture;
1299 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1301 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1304 texture = DPSOFTRAST_Texture_GetByIndex(index);
1305 if (index && !texture)
1307 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1311 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1312 command->unitnum = unitnum;
1313 command->texture = texture;
1315 dpsoftrast.texbound[unitnum] = texture;
1316 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array in global state. The pointer is stored, not copied.
//   vertex3f - base pointer of the position data
//   stride   - byte distance between consecutive vertices (DPSOFTRAST_Array_Load adds firstvertex * stride to the byte pointer)
1319 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1321 dpsoftrast.pointer_vertex3f = vertex3f;
1322 dpsoftrast.stride_vertex = stride;
// Records a float color array in global state and clears the unsigned-byte color pointer,
// so at most one of the two color sources is set after this call.
//   color4f - base pointer of the color data
//   stride  - byte distance between consecutive colors
1324 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1326 dpsoftrast.pointer_color4f = color4f;
1327 dpsoftrast.pointer_color4ub = NULL;
1328 dpsoftrast.stride_color = stride;
// Records an unsigned-byte color array in global state and clears the float color pointer,
// so at most one of the two color sources is set after this call.
//   color4ub - base pointer of the color data
//   stride   - byte distance between consecutive colors
1330 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1332 dpsoftrast.pointer_color4f = NULL;
1333 dpsoftrast.pointer_color4ub = color4ub;
1334 dpsoftrast.stride_color = stride;
// Records a texture coordinate array for one unit in global state.
//   unitnum       - texcoord unit used as the array index
//   numcomponents - floats per coordinate (DPSOFTRAST_Array_Load switches on this to pick a loader)
//   stride        - byte distance between consecutive coordinates
//   texcoordf     - base pointer of the coordinate data
// NOTE(review): unitnum is used as an index without a range check in the visible code - confirm callers validate it.
1336 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1338 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1339 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1340 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader) carries the shader mode, permutation and exact-specular-math flag.
1343 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command: copies all three fields into the thread state.
1344 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1346 thread->shader_mode = command->mode;
1347 thread->shader_permutation = command->permutation;
1348 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: queues a SetShader command and also mirrors the three values
// into the global dpsoftrast state.
1350 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1352 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1353 command->mode = mode;
1354 command->permutation = permutation;
1355 command->exactspecularmath = exactspecularmath;
// keep the global copy in step with what was queued
1357 dpsoftrast.shader_mode = mode;
1358 dpsoftrast.shader_permutation = permutation;
1359 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Command 19 (Uniform4f) carries a uniform index and four floats.
1362 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: uniform slot i occupies floats [i*4, i*4+3] of thread->uniform4f.
1363 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1365 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: queues a Uniform4f command for uniform slot index and mirrors
// the same four values into the global dpsoftrast.uniform4f array at index*4.
1367 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1369 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1370 command->index = index;
1371 command->val[0] = v0;
1372 command->val[1] = v1;
1373 command->val[2] = v2;
1374 command->val[3] = v3;
// keep the global copy in step with what was queued
1376 dpsoftrast.uniform4f[index*4+0] = v0;
1377 dpsoftrast.uniform4f[index*4+1] = v1;
1378 dpsoftrast.uniform4f[index*4+2] = v2;
1379 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: v must point at four readable floats.
// Queues a Uniform4f command and mirrors the values into dpsoftrast.uniform4f at index*4.
1381 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1383 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1384 command->index = index;
// sizeof(command->val) is sizeof(float[4])
1385 memcpy(command->val, v, sizeof(command->val));
1387 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f) carries a uniform index and 16 floats; the payload is
// declared through ALIGN() so it can be written with aligned SSE stores.
1390 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: the 16 floats land at uniform4f[index*4],
// i.e. the matrix spans four consecutive 4-float uniform slots.
1391 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1393 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads arraysize 4x4 float matrices starting at uniform slot `uniform`.
//   uniform   - first uniform slot; each matrix takes four consecutive slots
//   arraysize - number of matrices to read from v
//   transpose - NOTE(review): presumably selects the transpose step below; the guard itself is not visible here - confirm
//   v         - 16 floats per matrix; may be unaligned (an unaligned load path exists)
// One UniformMatrix4f command is queued per matrix and the same data is mirrored into
// the global dpsoftrast.uniform4f array.
1395 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// per matrix: advance 4 uniform slots and 16 source floats
1399 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1401 __m128 m0, m1, m2, m3;
1402 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1403 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the low bits of the source address
1404 if (((size_t)v)&(ALIGN_SIZE-1))
1406 m0 = _mm_loadu_ps(v);
1407 m1 = _mm_loadu_ps(v+4);
1408 m2 = _mm_loadu_ps(v+8);
1409 m3 = _mm_loadu_ps(v+12);
1413 m0 = _mm_load_ps(v);
1414 m1 = _mm_load_ps(v+4);
1415 m2 = _mm_load_ps(v+8);
1416 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the halves so that
// m0..m3 hold what were columns 0..3
1420 __m128 t0, t1, t2, t3;
1421 t0 = _mm_unpacklo_ps(m0, m1);
1422 t1 = _mm_unpacklo_ps(m2, m3);
1423 t2 = _mm_unpackhi_ps(m0, m1);
1424 t3 = _mm_unpackhi_ps(m2, m3);
1425 m0 = _mm_movelh_ps(t0, t1);
1426 m1 = _mm_movehl_ps(t1, t0);
1427 m2 = _mm_movelh_ps(t2, t3);
1428 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload
1430 _mm_store_ps(command->val, m0);
1431 _mm_store_ps(command->val+4, m1);
1432 _mm_store_ps(command->val+8, m2);
1433 _mm_store_ps(command->val+12, m3);
// mirror into the global uniform array; aligned stores, so uniform4f must be 16-byte aligned
1434 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1435 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1437 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i) carries a uniform index and one int value.
1442 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores the value at thread->uniform1i[index].
1443 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1445 thread->uniform1i[command->index] = command->val;
// Public entry point: queues a Uniform1i command and mirrors the value into dpsoftrast.uniform1i.
// NOTE(review): the store of i0 into the command is not visible here; presumably it follows the index store - confirm.
1447 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1449 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1450 command->index = index;
// command->index was just set from index, so this writes slot `index`
1453 dpsoftrast.uniform1i[command->index] = i0;
// Command 24 (ClipPlane) carries the four plane coefficients.
1456 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Applies a ClipPlane command: copies the four coefficients into thread->clipplane.
1457 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1459 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
// clip plane changes are handled by the framebuffer validation flag
1460 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a ClipPlane command with coefficients stored in the order x, y, z, w.
1462 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1464 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1465 command->clipplane[0] = x;
1466 command->clipplane[1] = y;
1467 command->clipplane[2] = z;
1468 command->clipplane[3] = w;
// Copies 4-float items from a strided byte source into a packed 4-float destination.
//   dst    - output, written with aligned stores (_mm_store_ps), so it must be 16-byte aligned
//   src    - first source item
//   size   - number of items (end is dst + size*4 floats)
//   stride - NOTE(review): presumably the byte step between source items; the pointer advance is not visible here - confirm
// Unaligned loads are used when either the source address or the stride has any of the
// low ALIGN_SIZE-1 bits set; otherwise aligned loads are used.
1472 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1474 float *end = dst + size*4;
1475 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1479 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1488 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float items from a strided byte source into packed 4-float items (x, y, z, 1.0).
//   dst    - output, written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   size   - number of items (end is dst + size*4 floats)
//   stride - selects the path: sizeof(float[3]) means tightly packed
// Tightly packed input is converted four items at a time from three 16-byte loads
// (12 floats); end4 marks the end of the part that is a multiple of four items.
// The remaining path converts one item per load.
// The recurring three-step idiom is: rotate so the lane to replace is lane 0,
// overwrite lane 0 with 1.0 via _mm_move_ss, rotate back so the 1.0 ends up in lane 3.
// NOTE(review): the single-item path loads 16 bytes per item, i.e. one float beyond the
// three that are used; confirm the source buffers tolerate that read on the last item.
1495 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1497 float *end = dst + size*4;
1498 if (stride == sizeof(float[3]))
1500 float *end4 = dst + (size&~3)*4;
1501 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned variant: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1505 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// item 0: x0 y0 z0 from v1
1506 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1507 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1508 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// item 1: x1 from v1, y1 z1 from v2
1509 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1510 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1511 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// item 2: x2 y2 from v2, z2 from v3
1512 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1513 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1514 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// item 3: x3 y3 z3 already sit in lanes 1..3 of v3
1516 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1517 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// four packed items consumed
1519 src += 4*sizeof(float[3]);
// aligned variant of the same four-item conversion
1526 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1527 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1528 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1531 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1532 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1533 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1534 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1535 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1538 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1540 src += 4*sizeof(float[3]);
// single-item path: unaligned loads if the address or the stride is not ALIGN_SIZE-aligned
1544 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1548 __m128 v = _mm_loadu_ps((const float *)src);
1549 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1550 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1551 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1552 _mm_store_ps(dst, v);
1561 __m128 v = _mm_load_ps((const float *)src);
1562 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1563 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1564 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1565 _mm_store_ps(dst, v);
// Expands 2-float items from a strided byte source into packed 4-float items (a, b, 0.0, 1.0).
//   dst    - output, written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   size   - number of items (end is dst + size*4 floats)
//   stride - selects the path: sizeof(float[2]) means tightly packed
// Tightly packed input is converted two items per 16-byte load; end2 marks the end of
// the even part. The remaining path converts one item per 8-byte load.
1572 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1574 float *end = dst + size*4;
// supplies the constant upper half (0, 1) of every output item
1575 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1576 if (stride == sizeof(float[2]))
1578 float *end2 = dst + (size&~1)*4;
1579 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = a0 b0 a1 b1
1583 __m128 v = _mm_loadu_ps((const float *)src);
// low half of v + high half of v2 -> a0 b0 0 1
1584 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// high half of v + high half of v2 -> a1 b1 0 1
1585 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1587 src += 2*sizeof(float[2]);
// aligned variant of the same two-item conversion
1594 __m128 v = _mm_load_ps((const float *)src);
1595 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1596 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1598 src += 2*sizeof(float[2]);
// single-item path: replace the low two lanes of v2 with the 8 bytes at src
1604 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte items from a strided byte source into packed 4-float items,
// scaling each byte by 1/255 so 0..255 maps to 0.0..1.0.
//   dst    - output, written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   size   - number of items (end is dst + size*4 floats)
//   stride - selects the path: sizeof(unsigned char[4]) means tightly packed
// Tightly packed input is converted four items per 16-byte load; end4 marks the end of
// the part that is a multiple of four. The remaining path converts one item per 32-bit read.
1610 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1612 float *end = dst + size*4;
1613 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1614 if (stride == sizeof(unsigned char[4]))
1616 float *end4 = dst + (size&~3)*4;
1617 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen 16 bytes to two vectors of eight 16-bit values by interleaving with zero
1621 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
// widen again to 32-bit, convert to float and scale; one output item per store
1622 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1623 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1624 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1627 src += 4*sizeof(unsigned char[4]);
// aligned variant of the same four-item conversion
1634 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1635 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1636 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1637 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1638 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1640 src += 4*sizeof(unsigned char[4]);
// single-item path: reads the four bytes as one int
// NOTE(review): this dereferences src as int regardless of its alignment - confirm that is acceptable on all targets.
1646 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1647 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills a packed 4-float array with one constant 4-float value.
//   dst  - output, written with aligned stores, so it must be 16-byte aligned
//   src  - the four floats to replicate; read once with an unaligned load
//   size - number of items (end is dst + 4*size floats)
// NOTE(review): the loop around the store is not visible here; presumably it runs until dst reaches end - confirm.
1653 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1655 float *end = dst + 4*size;
1656 __m128 v = _mm_loadu_ps(src);
1659 _mm_store_ps(dst, v);
// Transforms numitems 4-float vectors by a 4x4 matrix.
//   out4f       - output array; may be the same pointer as in4f
//   in4f        - input array; aligned and unaligned inputs take separate load paths
//   numitems    - number of 4-float items
//   inmatrix16f - 16 floats, read as four rows m0..m3 with unaligned loads
// Each result is x*m0 + y*m1 + z*m2 + w*m3, where x/y/z/w are the lanes of the input vector.
// A matrix that compares bytewise equal to identity is handled as a plain copy.
1665 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1668 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1669 __m128 m0, m1, m2, m3;
// bytewise comparison, so only an exact identity takes this path
1671 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1673 // fast case for identity matrix
1674 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1677 end = out4f + numitems*4;
1678 m0 = _mm_loadu_ps(inmatrix16f);
1679 m1 = _mm_loadu_ps(inmatrix16f + 4);
1680 m2 = _mm_loadu_ps(inmatrix16f + 8);
1681 m3 = _mm_loadu_ps(inmatrix16f + 12);
1682 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: broadcast each lane of v and accumulate against the matching matrix row
1686 __m128 v = _mm_loadu_ps(in4f);
1688 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1691 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input: same arithmetic with an aligned load
1700 __m128 v = _mm_load_ps(in4f);
1702 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1704 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1705 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float items from in4f to out4f.
// Uses memcpy, so the two ranges must not overlap.
1713 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1715 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides one clip-space vertex (an __m128 x,y,z,w) and maps it through the viewport.
// The vertex is rotated to (1, x, y, z), multiplied by viewportscale, divided by w, offset by
// viewportcenter and rotated back. The result is therefore
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN/sN are lanes of viewportcenter/viewportscale: lane 0 of those vectors pairs with
// the 1/w term and lanes 1..3 with x, y, z.
1719 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1721 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1722 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1723 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1724 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same projection as DPSOFTRAST_PROJECTVERTEX: the visible statements are identical
// (rotate to (1, x, y, z), scale, divide by w, add the viewport center, rotate back).
// NOTE(review): no use of this macro appears in the visible code; confirm it is still needed.
1727 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1729 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1730 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1731 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1732 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one 4-float vector by a matrix given as four __m128 rows:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// NOTE(review): p is presumably a local copy of `in` declared in the macro body (the
// declaration is not visible here), which is what would make out and in safe to alias - confirm.
1735 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1738 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1739 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1740 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1741 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space Y extent of an axis-aligned bounding box after transformation.
//   starty, endy     - outputs: first row and one-past-last row covered by the box
//   minposf, maxposf - box corners as 4-float vectors; read with aligned loads
//   inmatrix16f      - 16-float transform matrix; read with unaligned loads
// The eight box corners are transformed (BBFRONT). A corner whose clip distance z + w is
// non-negative clears its bit in clipmask and contributes y/w to the running min/max.
// For every corner that stayed clipped, BBCLIP walks the three box edges leaving it
// (corner indices k^1, k^2, k^4) and, where the neighbour is unclipped, adds the point
// where the edge crosses clip distance zero. The min/max are then clamped to [-2, 2]
// and mapped through the viewport center/scale.
// NOTE(review): the return statement is not visible here; presumably it returns clipmask
// (non-zero when at least one corner was clipped) - confirm.
1744 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// one bit per box corner; a set bit means "clipped"
1746 int clipmask = 0xFF;
1747 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// min/max start inverted (2 and -2) so the first sample replaces both
1748 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1749 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1750 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix row so the transformed Y lands in lane 0,
// where the scalar (_ss) operations below work
1751 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1752 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1753 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1754 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, record its clip distance, and if it is not
// clipped clear its clipmask bit and fold its projected Y into minproj/maxproj
1755 #define BBFRONT(k, pos) \
1757 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1758 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1759 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1762 clipmask &= ~(1<<k); \
1763 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1764 minproj = _mm_min_ss(minproj, proj); \
1765 maxproj = _mm_max_ss(maxproj, proj); \
// corner k takes its x from maxpos when bit 0 of k is set, y when bit 1 is set, z when bit 2 is set
1769 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1770 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1771 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1772 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1773 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1774 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1778 if (clipmask&(1<<k)) \
1780 if (!(clipmask&(1<<(k^1)))) \
1782 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1783 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1784 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1785 minproj = _mm_min_ss(minproj, proj); \
1786 maxproj = _mm_max_ss(maxproj, proj); \
1788 if (!(clipmask&(1<<(k^2)))) \
1790 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1791 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1792 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1793 minproj = _mm_min_ss(minproj, proj); \
1794 maxproj = _mm_max_ss(maxproj, proj); \
1796 if (!(clipmask&(1<<(k^4)))) \
1798 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1799 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1800 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1801 minproj = _mm_min_ss(minproj, proj); \
1802 maxproj = _mm_max_ss(maxproj, proj); \
// second pass over all eight corners: add edge crossings for the clipped ones
1806 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring lane 2 of the viewport vectors into lane 0 for the scalar maths below
// (lane 2 is the one DPSOFTRAST_PROJECTVERTEX pairs with y)
1807 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1808 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to pixels
1809 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1810 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1811 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1812 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// the start row comes from maxproj and the end row from minproj
// NOTE(review): this ordering presumably relies on a negative Y viewport scale - confirm.
1813 *starty = _mm_cvttss_si32(maxproj);
1814 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems already-transformed vertices to screen space and computes their Y bounds.
//   out4f        - receives a copy of each input vertex (aligned stores)
//   screen4f     - receives each projected vertex (aligned stores)
//   starty, endy - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f         - input vertices; aligned and unaligned inputs take separate load paths
//   numitems     - number of vertices
// Returns the result of DPSOFTRAST_Vertex_BoundY for the min/max box of the inputs,
// evaluated with an identity matrix because the vertices are already in clip space.
1818 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1820 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1821 float *end = out4f + numitems*4;
1822 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1823 __m128 minpos, maxpos;
1824 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
// NOTE(review): this reads in4f[0..3] even when numitems is 0 - confirm callers never pass an empty range.
1826 minpos = maxpos = _mm_loadu_ps(in4f);
1829 __m128 v = _mm_loadu_ps(in4f);
1830 minpos = _mm_min_ps(minpos, v);
1831 maxpos = _mm_max_ps(maxpos, v);
// keep the unprojected vertex, then write the projected one
1832 _mm_store_ps(out4f, v);
1833 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1834 _mm_store_ps(screen4f, v);
// aligned variant of the same loop
1842 minpos = maxpos = _mm_load_ps(in4f);
1845 __m128 v = _mm_load_ps(in4f);
1846 minpos = _mm_min_ps(minpos, v);
1847 maxpos = _mm_max_ps(maxpos, v);
1848 _mm_store_ps(out4f, v);
1849 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1850 _mm_store_ps(screen4f, v);
// spill the box to aligned temporaries because BoundY loads it with _mm_load_ps
1858 ALIGN(float minposf[4]);
1859 ALIGN(float maxposf[4]);
1860 _mm_store_ps(minposf, minpos);
1861 _mm_store_ps(maxposf, maxpos);
1862 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms numitems vertices by a matrix, projects them to screen space and computes their Y bounds.
//   out4f        - receives each transformed (clip-space) vertex (aligned stores)
//   screen4f     - receives each projected vertex (aligned stores)
//   starty, endy - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f         - input vertices; aligned and unaligned inputs take separate load paths
//   numitems     - number of vertices
//   inmatrix16f  - 16-float transform matrix; read with unaligned loads
// A matrix that compares bytewise equal to identity is delegated to DPSOFTRAST_Vertex_Project.
// Otherwise the min/max box is accumulated from the untransformed inputs and handed to
// DPSOFTRAST_Vertex_BoundY together with the matrix, whose result is returned.
1867 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1869 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1870 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1872 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1873 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1874 end = out4f + numitems*4;
1875 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1876 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1877 m0 = _mm_loadu_ps(inmatrix16f);
1878 m1 = _mm_loadu_ps(inmatrix16f + 4);
1879 m2 = _mm_loadu_ps(inmatrix16f + 8);
1880 m3 = _mm_loadu_ps(inmatrix16f + 12);
1881 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
// NOTE(review): this reads in4f[0..3] even when numitems is 0 - confirm callers never pass an empty range.
1883 minpos = maxpos = _mm_loadu_ps(in4f);
1886 __m128 v = _mm_loadu_ps(in4f);
// the box is accumulated before the transform, i.e. in input space
1887 minpos = _mm_min_ps(minpos, v);
1888 maxpos = _mm_max_ps(maxpos, v);
1889 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1890 _mm_store_ps(out4f, v);
1891 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1892 _mm_store_ps(screen4f, v);
// aligned variant of the same loop
1900 minpos = maxpos = _mm_load_ps(in4f);
1903 __m128 v = _mm_load_ps(in4f);
1904 minpos = _mm_min_ps(minpos, v);
1905 maxpos = _mm_max_ps(maxpos, v);
1906 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1907 _mm_store_ps(out4f, v);
1908 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1909 _mm_store_ps(screen4f, v);
// spill the box to aligned temporaries because BoundY loads it with _mm_load_ps
1917 ALIGN(float minposf[4]);
1918 ALIGN(float maxposf[4]);
1919 _mm_store_ps(minposf, minpos);
1920 _mm_store_ps(maxposf, maxpos);
1921 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one client vertex array into the packed 4-float working array dpsoftrast.post_array4f[outarray].
//   outarray - index of the destination working array
//   inarray  - which client array to read (position, color, or a texcoord unit
//              addressed as inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
// The range read is dpsoftrast.firstvertex .. firstvertex + numvertices, using the
// pointers and strides recorded by the DPSOFTRAST_Set*Pointer functions.
// NOTE(review): the switch header, the break statements and the return are not visible
// here; presumably the function returns outf - confirm.
1927 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1930 float *outf = dpsoftrast.post_array4f[outarray];
1931 const unsigned char *inb;
1932 int firstvertex = dpsoftrast.firstvertex;
1933 int numvertices = dpsoftrast.numvertices;
// positions: 3 floats per vertex, expanded to 4 with w = 1
1937 case DPSOFTRAST_ARRAY_POSITION:
1938 stride = dpsoftrast.stride_vertex;
1939 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1940 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array, else byte array, else the constant color
1942 case DPSOFTRAST_ARRAY_COLOR:
1943 stride = dpsoftrast.stride_color;
1944 if (dpsoftrast.pointer_color4f)
1946 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1947 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1949 else if (dpsoftrast.pointer_color4ub)
// stride already holds this value from the top of the case
1951 stride = dpsoftrast.stride_color;
1952 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1953 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array bound: replicate dpsoftrast.color for every vertex
1957 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinates: the loader is chosen by the recorded component count
1961 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1962 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1964 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1965 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1971 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1974 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by a 4x4 matrix.
//   outarray    - index of the working array in dpsoftrast.post_array4f
//   inarray     - client array to load first; a negative value means "use the working array as it is"
//   inmatrix16f - 16-float matrix passed to DPSOFTRAST_Vertex_Transform
// NOTE(review): the return statement is not visible here; presumably it returns data - confirm.
1986 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1988 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
// in-place: the same pointer is passed as output and input
1989 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without transforming it first.
//   outarray - index of the working array in dpsoftrast.post_array4f
//   inarray  - client array to load first; a negative value means "use the working array as it is"
// Writes the projected vertices to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty/drawendy and the function result to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible here; presumably it returns data - confirm.
1994 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1997 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1998 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by a 4x4 matrix and projects it to screen space.
//   outarray    - index of the working array in dpsoftrast.post_array4f
//   inarray     - client array to load first; a negative value means "use the working array as it is"
//   inmatrix16f - 16-float matrix passed to DPSOFTRAST_Vertex_TransformProject
// Writes the projected vertices to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty/drawendy and the function result to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible here; presumably it returns data - confirm.
2006 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2009 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2010 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares the per-pixel perspective term for one span.
//   thread   - calling thread's state (not referenced by the visible statements)
//   triangle - supplies the w plane: w[0] is the slope along x, w[1] the slope along y, w[2] the constant
//   span     - supplies the origin (x, y) and the pixel range [startx, endx)
//   zf       - NOTE(review): presumably receives 1/w for every pixel of the span; the stores are not visible here - confirm
// The reciprocal 1/w is evaluated exactly only at sub-span boundaries
// (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels) and stepped linearly in between.
2017 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2020 int startx = span->startx;
2021 int endx = span->endx;
2022 float wslope = triangle->w[0];
// w at the span origin; pixel offsets along x are added through wslope
2023 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact reciprocal at the first pixel
2024 float endz = 1.0f / (w + wslope * startx);
// zero slope along x: the reciprocal is constant across the span
2025 if (triangle->w[0] == 0)
2027 // LordHavoc: fast flat polygons (HUD/menu)
2028 for (x = startx;x < endx;x++)
2032 for (x = startx;x < endx;)
2034 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened to end on the final pixel
2036 if (nextsub >= endx) nextsub = endsub = endx-1;
// exact reciprocal at the sub-span end, linear step in between (0 for a one-pixel sub-span)
2037 endz = 1.0f / (w + wslope * nextsub);
2038 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2039 for (; x <= endsub; x++, z += dz)
// Writes one span of shaded BGRA8 pixels (in4ub) into color buffer 0 of the
// framebuffer. Pixels whose span->pixelmask entry is cleared are skipped, and
// the rest are combined with the existing framebuffer contents according to
// thread->fb_blendmode. in4ub is indexed with the same x as the span
// (startx..endx-1), 4 bytes per pixel.
2044 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2048 int startx = span->startx;
2049 int endx = span->endx;
// 32-bit view of the input so a whole pixel can be moved at once
2051 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2052 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte-wise and pixel-wise views of the same color buffer
2053 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2054 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views to the span origin; x below is relative to span->x
2057 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2058 pixeli += span->y * dpsoftrast.fb_width + span->x;
2059 // handle alphatest now (this affects depth writes too)
2060 if (thread->alphatest)
2061 for (x = startx;x < endx;x++)
// byte 3 of each pixel is alpha; values below 128 fail the test
2062 if (in4ub[x*4+3] < 128)
2063 pixelmask[x] = false;
2064 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2065 // helps sprites, text and hud artwork
2066 switch(thread->fb_blendmode)
// in these three modes a source alpha of 0 leaves the destination unchanged
// (see the blend formulas further down), so such pixels are masked off
2068 case DPSOFTRAST_BLENDMODE_ALPHA:
2069 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2070 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2071 for (x = startx;x < endx;x++)
2072 if (in4ub[x*4+3] < 1)
2073 pixelmask[x] = false;
2075 case DPSOFTRAST_BLENDMODE_OPAQUE:
2076 case DPSOFTRAST_BLENDMODE_ADD:
2077 case DPSOFTRAST_BLENDMODE_INVMOD:
2078 case DPSOFTRAST_BLENDMODE_MUL:
2079 case DPSOFTRAST_BLENDMODE_MUL2:
2080 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2081 case DPSOFTRAST_BLENDMODE_INVADD:
2084 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): this writes two bytes past the span's last mask entry, so the
// mask buffer must have room for endx+2 entries - confirm at the allocation site
2085 pixelmask[endx] = 1;
2086 pixelmask[endx+1] = 0;
2087 // LordHavoc: use a double loop to identify subspans, this helps the
2088 // optimized copy/blend loops to perform at their best, most triangles
2089 // have only one run of pixels, and do the search using wide reads...
2093 // if this pixel is masked off, it's probably not alone...
2100 // the 4-item search must be aligned or else it stalls badly
// step over up to three single pixels to reach an x that is a multiple of 4
2101 if ((x & 3) && !pixelmask[x])
2103 if(pixelmask[x]) goto endmasked;
2107 if(pixelmask[x]) goto endmasked;
2111 if(pixelmask[x]) goto endmasked;
// skip four masked-off pixels per test (all four mask bytes are zero)
2116 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// then finish the search one pixel at a time
2120 for (;!pixelmask[x];x++)
2122 // rather than continue the loop, just check the end variable
2127 // find length of subspan
// the wide search is only used when more than 8 pixels remain
2130 if (subx + 8 < endx)
2134 if(!pixelmask[subx]) goto endunmasked;
2138 if(!pixelmask[subx]) goto endunmasked;
2142 if(!pixelmask[subx]) goto endunmasked;
// matches only when all four mask bytes are exactly 1
2147 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2151 for (;pixelmask[subx];subx++)
2153 // the checks can overshoot, so make sure to clip it...
2157 // now that we know the subspan length... process!
// from here on [x, subx) is one run of unmasked pixels
2158 switch(thread->fb_blendmode)
2160 case DPSOFTRAST_BLENDMODE_OPAQUE:
// opaque: the run is copied verbatim
2164 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// same copy written with unaligned 128-bit moves: 16 pixels per iteration,
// then 4 pixels per iteration, then single pixels
2169 while (x + 16 <= subx)
2171 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2172 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2173 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2174 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2179 while (x + 4 <= subx)
2181 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2187 pixeli[x+1] = ini[x+1];
2197 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND widens source and destination bytes to 16-bit lanes, applies
// blend2 to two pixels at a time (64-bit load/store) and blend1 to a single
// leftover pixel (32-bit load/store), then packs back to bytes with unsigned
// saturation.
// In the blend snippets, _MM_SHUFFLE(3, 3, 3, 3) broadcasts 16-bit lane 3 of a
// pixel (its alpha) to all four of that pixel's lanes.
// ALPHA: dst += (src - dst) * alpha / 256 (both factors are shifted left by 4
// so that the high half of the 16x16 product is the product divided by 256)
2198 #define FINISHBLEND(blend2, blend1) \
2199 for (;x + 1 < subx;x += 2) \
2202 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2203 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2205 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2210 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2211 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2213 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2217 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2218 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2220 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2221 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2224 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2226 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2227 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2229 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2230 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (the final pack saturates at 255)
2233 case DPSOFTRAST_BLENDMODE_ADD:
2234 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2236 case DPSOFTRAST_BLENDMODE_INVMOD:
2238 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2240 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2243 case DPSOFTRAST_BLENDMODE_MUL:
2244 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2246 case DPSOFTRAST_BLENDMODE_MUL2:
2247 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2249 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2251 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2252 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2254 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2255 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2258 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2260 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2261 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2263 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2264 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src / 256
2267 case DPSOFTRAST_BLENDMODE_INVADD:
2269 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2271 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texunitindex along a span and writes float
// RGBA (0..1) to out4f, 4 floats per pixel, for x in startx..endx-1.
// Texture coordinates come from vertex array arrayindex and are multiplied by
// the per-pixel zf value. Both bilinear (four taps) and single-tap sampling
// are implemented, each with a clamp-to-edge and a wrapping variant.
2279 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2282 int startx = span->startx;
2283 int endx = span->endx;
2288 float tc[2], endtc[2];
// integer texel coordinates: tci is the sampled texel, tci1 its +1 neighbour
2290 unsigned int tci[2];
2291 unsigned int tci1[2];
2292 unsigned int tcimin[2];
2293 unsigned int tcimax[2];
2298 const unsigned char * RESTRICT pixelbase;
2299 const unsigned char * RESTRICT pixel[4];
2300 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2301 // if no texture is bound, just fill it with white
2304 for (x = startx;x < endx;x++)
2306 out4f[x*4+0] = 1.0f;
2307 out4f[x*4+1] = 1.0f;
2308 out4f[x*4+2] = 1.0f;
2309 out4f[x*4+3] = 1.0f;
2313 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2314 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2315 // if this mipmap of the texture is 1 pixel, just fill it with that color
// mipmap[mip][1] is presumably the level's size in bytes (4 = one texel) - confirm
2316 if (texture->mipmap[mip][1] == 4)
// texel bytes 2,1,0,3 go to output channels 0,1,2,3 (bytes 0 and 2 are swapped)
// NOTE(review): this reads the first texel of texture->bytes, not pixelbase
// (the start of the selected mip level) - looks unintended unless the offset
// is 0; the BGRA8 variant of this function reads from pixelbase. Confirm.
2318 c[0] = texture->bytes[2] * (1.0f/255.0f);
2319 c[1] = texture->bytes[1] * (1.0f/255.0f);
2320 c[2] = texture->bytes[0] * (1.0f/255.0f);
2321 c[3] = texture->bytes[3] * (1.0f/255.0f);
2322 for (x = startx;x < endx;x++)
2324 out4f[x*4+0] = c[0];
2325 out4f[x*4+1] = c[1];
2326 out4f[x*4+2] = c[2];
2327 out4f[x*4+3] = c[3];
// nonzero when the texture asks for linear filtering
2331 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2332 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2333 flags = texture->flags;
// mipmap[mip][2] doubles as the row width in texels (tciwidth); [3] is used
// the same way for the second axis, presumably the height
2334 tcscale[0] = texture->mipmap[mip][2];
2335 tcscale[1] = texture->mipmap[mip][3];
2336 tciwidth = texture->mipmap[mip][2];
2339 tcimax[0] = texture->mipmap[mip][2]-1;
2340 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 used as an AND mask for wrapping - assumes power-of-two sizes, TODO confirm
2341 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2342 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel: (data + slope*x) * zf[x] * size
2343 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2344 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in pieces of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the
// exact coordinate is computed at each piece's end and stepped linearly inside
2350 for (x = startx;x < endx;)
// subtc/substep are 20.12 fixed point (12 fractional bits)
2352 unsigned int subtc[2];
2353 unsigned int substep[2];
2354 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2355 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2356 if (nextsub >= endx)
// last piece: finish on the final pixel and rescale the step to its length
2358 nextsub = endsub = endx-1;
2359 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2363 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2364 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// NOTE(review): converting a negative float to unsigned int is undefined
// behaviour in C; confirm coordinates/steps cannot be negative here or that
// the target compilers behave as intended
2370 substep[0] = (endtc[0] - tc[0]) * subscale;
2371 substep[1] = (endtc[1] - tc[1]) * subscale;
2372 subtc[0] = tc[0] * (1<<12);
2373 subtc[1] = tc[1] * (1<<12);
// bilinear, clamp to edge
2376 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2378 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// frac is the 12-bit position between texels, ifrac its complement; the four
// lerp weights are products of two 12-bit values and sum to 2^24
2380 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2381 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2382 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2383 tci[0] = subtc[0]>>12;
2384 tci[1] = subtc[1]>>12;
2385 tci1[0] = tci[0] + 1;
2386 tci1[1] = tci[1] + 1;
// clamp both taps into [tcimin, tcimax]
// NOTE(review): these are unsigned comparisons, so a coordinate that wrapped
// below zero compares as a very large value and clamps to tcimax - confirm
2387 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2388 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2389 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2390 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// the four neighbouring texels, 4 bytes each: (x,y) (x+1,y) (x,y+1) (x+1,y+1)
2391 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2392 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2393 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2394 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// weighted sum is at most 255 * 2^24 = 0xFF000000, so this scales to 0..1
2395 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2396 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2397 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2398 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2399 out4f[x*4+0] = c[0];
2400 out4f[x*4+1] = c[1];
2401 out4f[x*4+2] = c[2];
2402 out4f[x*4+3] = c[3];
// bilinear, wrapping: same as above but coordinates are ANDed with size-1
2407 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2409 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2410 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2411 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2412 tci[0] = subtc[0]>>12;
2413 tci[1] = subtc[1]>>12;
2414 tci1[0] = tci[0] + 1;
2415 tci1[1] = tci[1] + 1;
2416 tci[0] &= tciwrapmask[0];
2417 tci[1] &= tciwrapmask[1];
2418 tci1[0] &= tciwrapmask[0];
2419 tci1[1] &= tciwrapmask[1];
2420 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2421 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2422 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2423 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2424 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2425 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2426 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2427 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2428 out4f[x*4+0] = c[0];
2429 out4f[x*4+1] = c[1];
2430 out4f[x*4+2] = c[2];
2431 out4f[x*4+3] = c[3];
// single tap, clamp to edge
2435 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2437 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2439 tci[0] = subtc[0]>>12;
2440 tci[1] = subtc[1]>>12;
2441 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2442 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2443 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2444 c[0] = pixel[0][2] * (1.0f / 255.0f);
2445 c[1] = pixel[0][1] * (1.0f / 255.0f);
2446 c[2] = pixel[0][0] * (1.0f / 255.0f);
2447 c[3] = pixel[0][3] * (1.0f / 255.0f);
2448 out4f[x*4+0] = c[0];
2449 out4f[x*4+1] = c[1];
2450 out4f[x*4+2] = c[2];
2451 out4f[x*4+3] = c[3];
// single tap, wrapping
2456 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2458 tci[0] = subtc[0]>>12;
2459 tci[1] = subtc[1]>>12;
2460 tci[0] &= tciwrapmask[0];
2461 tci[1] &= tciwrapmask[1];
2462 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2463 c[0] = pixel[0][2] * (1.0f / 255.0f);
2464 c[1] = pixel[0][1] * (1.0f / 255.0f);
2465 c[2] = pixel[0][0] * (1.0f / 255.0f);
2466 c[3] = pixel[0][3] * (1.0f / 255.0f);
2467 out4f[x*4+0] = c[0];
2468 out4f[x*4+1] = c[1];
2469 out4f[x*4+2] = c[2];
2470 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture span sampler: writes packed 4-byte pixels to
// out4ub (copied straight from the texel bytes, or bilinearly blended per
// byte) for x in startx..endx-1. Texture coordinates come from vertex array
// arrayindex, multiplied by the per-pixel zf value. Two pixels are processed
// per loop iteration, with a single-pixel tail.
2476 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2480 int startx = span->startx;
2481 int endx = span->endx;
2483 __m128 data, slope, tcscale;
2484 __m128i tcsize, tcmask, tcoffset, tcmax;
2486 __m128i subtc, substep, endsubtc;
2489 int affine; // LordHavoc: optimized affine texturing case
// 32-bit view of the output so a whole pixel can be stored at once
2490 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2491 const unsigned char * RESTRICT pixelbase;
2492 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2493 // if no texture is bound, just fill it with white
2496 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2499 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2500 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2501 // if this mipmap of the texture is 1 pixel, just fill it with that color
2502 if (texture->mipmap[mip][1] == 4)
// k is the single texel of this level, read as one 32-bit value
2504 unsigned int k = *((const unsigned int *)pixelbase);
2505 for (x = startx;x < endx;x++)
// equal zf at both ends of the span: the whole span is then handled as one
// linearly stepped piece (see the "|| affine" test below)
2509 affine = zf[startx] == zf[endx-1];
2510 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2511 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2512 flags = texture->flags;
// 32-bit lanes become mipmap[mip][2], [3], [2], [3] - the two sizes, twice
2513 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
// size-1 per axis: the largest valid texel index
2514 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2515 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the first two attribute components into the upper half and scale
// them to texel units
2516 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2517 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// coordinate at the first pixel of the span
2518 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift. NOTE(review): presumably applied only when filtering; the
// guarding condition is not among the lines shown - confirm
2520 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// coordinates are kept in 16.16 fixed point from here on
2521 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// per 32-bit lane: low 16 bits = 4 (bytes per texel), high 16 bits = first
// size * 4 (bytes per row). _mm_madd_epi16 of an (x, y) pair of 16-bit texel
// indices with this value yields the texel's byte offset x*4 + y*rowbytes.
2522 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 narrowed to 16-bit lanes, matching the (x, y) index pairs
2523 tcmax = _mm_packs_epi32(tcmask, tcmask);
// walk the span in pieces of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2524 for (x = startx;x < endx;)
2526 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2527 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2528 if (nextsub >= endx || affine)
// last (or only) piece: finish on the final pixel and rescale the step
2530 nextsub = endsub = endx-1;
2531 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// exact coordinate at the end of this piece
2535 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2537 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// per-pixel step in 16.16 fixed point
2538 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2539 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low 64 bits: coordinates of pixel x, high 64 bits: coordinates of pixel x+1;
// the step is doubled because each iteration advances two pixels
2540 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2541 substep = _mm_slli_epi32(substep, 1);
// integer texel coordinates at the start of the piece and one (doubled) step
// past its end; if all of them are >= 0 and < size-1, every texel touched and
// its +1 neighbours lie inside the texture, so clamp/wrap can be skipped
2544 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2545 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// bytes per texture row (high 16 bits of tcoffset)
2547 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2548 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2550 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// keep the integer halves (16-bit lanes 1 and 3 of each 64-bit half) as
// (x, y) index pairs, then turn them into byte offsets
2551 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2552 tci = _mm_madd_epi16(tci, tcoffset);
2553 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2554 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches a texel and its right neighbour; +stride is the row below
2555 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2556 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2557 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2558 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fractions reduced to 15 bits so they stay non-negative for the signed multiply
2559 fracm = _mm_srli_epi16(subtc, 1);
// blend the two rows by the fraction of the second coordinate (first pixel)
2560 pix1 = _mm_add_epi16(pix1,
2561 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2562 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
// same for the second pixel
2563 pix3 = _mm_add_epi16(pix3,
2564 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2565 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// gather the left texels of both pixels in pix2 and the right texels in pix4,
// then blend them by the fraction of the first coordinate
2566 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2567 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2568 pix2 = _mm_add_epi16(pix2,
2569 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2570 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
// pack both pixels back to bytes and store them together
2571 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail of the in-range bilinear path
2575 const unsigned char * RESTRICT ptr1;
2576 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2577 tci = _mm_madd_epi16(tci, tcoffset);
2578 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2579 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2580 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2581 fracm = _mm_srli_epi16(subtc, 1);
2582 pix1 = _mm_add_epi16(pix1,
2583 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2584 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2585 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2586 pix1 = _mm_add_epi16(pix1,
2587 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2588 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2589 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear with clamp to edge: each of the four taps is clamped individually
2593 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2595 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
// replicate the first pixel's (x, y) pair into all four 32-bit lanes, add
// (0,0) (1,0) (0,1) (1,1) to form the four taps, clamp to [0, size-1]
2597 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2598 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2599 tci = _mm_madd_epi16(tci, tcoffset);
// pix1 = upper row (left, right), pix2 = lower row (left, right)
2600 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2601 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2602 _mm_setzero_si128());
2603 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2604 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2605 _mm_setzero_si128());
// same four taps for the second pixel
2606 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): this is the clamp-to-edge branch, yet the second pixel's taps
// are wrapped with an AND against tcmax instead of clamped with min/max as the
// first pixel's are (and as the single-pixel tail below does). Looks like a
// copy/paste from the wrapping branch - confirm and align with the first pixel.
2607 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2608 tci = _mm_madd_epi16(tci, tcoffset);
2609 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2610 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2611 _mm_setzero_si128());
2612 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2613 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2614 _mm_setzero_si128());
// blend rows first, then columns, as in the in-range path
2615 fracm = _mm_srli_epi16(subtc, 1);
2616 pix1 = _mm_add_epi16(pix1,
2617 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2618 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2619 pix3 = _mm_add_epi16(pix3,
2620 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2621 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2622 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2623 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2624 pix2 = _mm_add_epi16(pix2,
2625 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2626 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2627 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail, clamped
2631 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2632 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2633 tci = _mm_madd_epi16(tci, tcoffset);
2634 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2635 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2636 _mm_setzero_si128());
2637 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2638 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2639 _mm_setzero_si128());
2640 fracm = _mm_srli_epi16(subtc, 1);
2641 pix1 = _mm_add_epi16(pix1,
2642 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2643 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2644 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2645 pix1 = _mm_add_epi16(pix1,
2646 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2647 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2648 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear with wrapping: taps are ANDed with size-1
// (assumes power-of-two sizes - TODO confirm)
2654 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2656 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2657 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2658 tci = _mm_madd_epi16(tci, tcoffset);
2659 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2660 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2661 _mm_setzero_si128());
2662 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2663 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2664 _mm_setzero_si128());
2665 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2666 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2667 tci = _mm_madd_epi16(tci, tcoffset);
2668 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2669 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2670 _mm_setzero_si128());
2671 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2672 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2673 _mm_setzero_si128());
2674 fracm = _mm_srli_epi16(subtc, 1);
2675 pix1 = _mm_add_epi16(pix1,
2676 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2677 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2678 pix3 = _mm_add_epi16(pix3,
2679 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2680 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2681 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2682 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2683 pix2 = _mm_add_epi16(pix2,
2684 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2685 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2686 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail, wrapped
2690 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2691 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2692 tci = _mm_madd_epi16(tci, tcoffset);
2693 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2694 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2695 _mm_setzero_si128());
2696 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2697 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2698 _mm_setzero_si128());
2699 fracm = _mm_srli_epi16(subtc, 1);
2700 pix1 = _mm_add_epi16(pix1,
2701 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2702 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2703 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2704 pix1 = _mm_add_epi16(pix1,
2705 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2706 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2707 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-tap sampling: the texel's four bytes are copied unchanged
// clamp-to-edge variant, two pixels per iteration
2714 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2716 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2718 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2719 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2720 tci = _mm_madd_epi16(tci, tcoffset);
2721 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2722 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel tail, clamped
2726 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2727 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2728 tci = _mm_madd_epi16(tci, tcoffset);
2729 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// wrapping variant, two pixels per iteration
2735 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2737 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2738 tci = _mm_and_si128(tci, tcmax);
2739 tci = _mm_madd_epi16(tci, tcoffset);
2740 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2741 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel tail, wrapped
2745 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2746 tci = _mm_and_si128(tci, tcmax);
2747 tci = _mm_madd_epi16(tci, tcoffset);
2748 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2757 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2760 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2763 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a float RGBA span buffer by an interpolated vertex attribute:
// out4f = in4f * c per channel, for x in startx..endx-1, where
// c = (data + slope*x) * z is the attribute of vertex array arrayindex.
// NOTE(review): z is presumably the per-pixel zf[x] value - its assignment is
// not among the lines shown, confirm.
2769 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2772 int startx = span->startx;
2773 int endx = span->endx;
// fills data/slope for the requested vertex array
2778 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2779 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2782 c[0] = (data[0] + slope[0]*x) * z;
2783 c[1] = (data[1] + slope[1]*x) * z;
2784 c[2] = (data[2] + slope[2]*x) * z;
2785 c[3] = (data[3] + slope[3]*x) * z;
// modulate the incoming color channel by channel
2786 out4f[x*4+0] = in4f[x*4+0] * c[0];
2787 out4f[x*4+1] = in4f[x*4+1] * c[1];
2788 out4f[x*4+2] = in4f[x*4+2] * c[2];
2789 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute to a float RGBA span buffer:
// out4f = (data + slope*x) * z per channel, for x in startx..endx-1, using
// vertex array arrayindex.
// NOTE(review): z is presumably the per-pixel zf[x] value - its assignment is
// not among the lines shown, confirm.
2793 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2796 int startx = span->startx;
2797 int endx = span->endx;
// fills data/slope for the requested vertex array
2802 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2803 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2806 c[0] = (data[0] + slope[0]*x) * z;
2807 c[1] = (data[1] + slope[1]*x) * z;
2808 c[2] = (data[2] + slope[2]*x) * z;
2809 c[3] = (data[3] + slope[3]*x) * z;
2810 out4f[x*4+0] = c[0];
2811 out4f[x*4+1] = c[1];
2812 out4f[x*4+2] = c[2];
2813 out4f[x*4+3] = c[3];
2817 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2819 int x, startx = span->startx, endx = span->endx;
2820 float c[4], localcolor[4];
2821 localcolor[0] = subcolor[0];
2822 localcolor[1] = subcolor[1];
2823 localcolor[2] = subcolor[2];
2824 localcolor[3] = subcolor[3];
2825 for (x = startx;x < endx;x++)
2827 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2828 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2829 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2830 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2831 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2832 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2833 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2834 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2838 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2840 int x, startx = span->startx, endx = span->endx;
2841 for (x = startx;x < endx;x++)
2843 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2844 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2845 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2846 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2850 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2852 int x, startx = span->startx, endx = span->endx;
2853 for (x = startx;x < endx;x++)
2855 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2856 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2857 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2858 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Mixes two float RGBA span buffers per pixel (x in startx..endx-1):
// out4f = ina4f * a + inb4f * b, with a = 1 - alpha of the inb4f pixel.
// NOTE(review): b is presumably the inb4f pixel's alpha (inb4f[x*4+3]), which
// would make this a standard alpha blend of inb over ina; its assignment is
// not among the lines shown - confirm.
2862 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2864 int x, startx = span->startx, endx = span->endx;
2866 for (x = startx;x < endx;x++)
// weight of the first buffer: one minus the second buffer's alpha
2868 a = 1.0f - inb4f[x*4+3];
// the same two weights are applied to all four channels, alpha included
2870 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2871 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2872 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2873 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends every pixel of in4f toward one constant color over [span->startx, span->endx):
// out = in * (1 - color[3]) + color * color[3], applied to all four components
// (the fourth component is blended with the same weights).
// triangle is not referenced in the visible lines.
2877 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2879 int x, startx = span->startx, endx = span->endx;
2880 float localcolor[4], ilerp, lerp;
// copy the four color components into a local array before the loop
2881 localcolor[0] = color[0];
2882 localcolor[1] = color[1];
2883 localcolor[2] = color[2];
2884 localcolor[3] = color[3];
// lerp is the weight of the constant color, ilerp the weight of the input pixel
2885 ilerp = 1.0f - localcolor[3];
2886 lerp = localcolor[3];
2887 for (x = startx;x < endx;x++)
2889 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2890 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2891 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2892 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies the 8-bit pixels of in4ub by an interpolated vertex attribute
// (selected by arrayindex) and writes the result to out4ub for x in
// [span->startx, span->endx). The attribute is evaluated exactly only at
// sub-span boundaries and stepped in 16-bit fixed point (scale 256) in between.
// zf supplies one float per pixel that the attribute is multiplied by
// (presumably the perspective factor produced by DPSOFTRAST_Draw_Span_Begin - confirm).
// NOTE(review): several short lines (declarations of x/data/slope/mod/endmod, the
// mod/submod hand-over at the top of each sub-span, braces) are absent from this listing.
2898 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2902 int startx = span->startx;
2903 int endx = span->endx;
2906 __m128i submod, substep, endsubmod;
// data/slope are filled in by the DPSOFTRAST_CALCATTRIB macro (defined outside this chunk)
2907 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 and keeps 1 and 3,
// consistent with the BGRA byte order named by the function
2908 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2909 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, and the same value scaled by 256 as integers
2910 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2911 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2912 for (x = startx; x < endx;)
// default sub-span: DPSOFTRAST_DRAW_MAXSUBSPAN pixels, step scale 256 / sub-span length
2914 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2915 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2916 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step to the shorter length
// (the x < nextsub test avoids dividing by zero for a one-pixel remainder)
2918 nextsub = endsub = endx-1;
2919 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
2923 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// per-pixel fixed-point step from the sub-span start value (mod) to its end value
2924 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2925 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low four lanes = modulator for pixel x, high four = pixel x+1
2926 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2927 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 but submod is advanced by a single per-pixel
// substep each iteration - confirm whether the step should be doubled.
2928 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// unpacking with zero in the low byte puts each input byte in the high half of a
// 16-bit lane (byte << 8), so mulhi yields byte * submod / 256
2930 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2931 pix = _mm_mulhi_epu16(pix, submod);
2932 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation; the statements guarding it are
// not visible in this listing
2936 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2937 pix = _mm_mulhi_epu16(pix, submod);
2938 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute (selected by arrayindex) as 8-bit
// pixels to out4ub for x in [span->startx, span->endx). The attribute is
// evaluated exactly only at sub-span boundaries and stepped in 16-bit fixed
// point (scale 4095) in between; a right shift by 4 brings it to the 8-bit range.
// zf supplies one float per pixel that the attribute is multiplied by
// (presumably the perspective factor produced by DPSOFTRAST_Draw_Span_Begin - confirm).
// NOTE(review): several short lines (declarations of x/data/slope/mod/endmod, the
// mod/submod hand-over at the top of each sub-span, braces) are absent from this listing.
2945 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2949 int startx = span->startx;
2950 int endx = span->endx;
2953 __m128i submod, substep, endsubmod;
// data/slope are filled in by the DPSOFTRAST_CALCATTRIB macro (defined outside this chunk)
2954 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 and keeps 1 and 3,
// consistent with the BGRA byte order named by the function
2955 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2956 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, and the same value scaled by 4095 as integers
2957 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2958 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2959 for (x = startx; x < endx;)
// default sub-span: DPSOFTRAST_DRAW_MAXSUBSPAN pixels, step scale 4095 / sub-span length
2961 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2962 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2963 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step to the shorter length
// (the x < nextsub test avoids dividing by zero for a one-pixel remainder)
2965 nextsub = endsub = endx-1;
2966 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
2970 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// per-pixel fixed-point step from the sub-span start value (mod) to its end value
2971 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2972 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low four lanes = value for pixel x, high four = pixel x+1
2973 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2974 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 but submod is advanced by a single per-pixel
// substep each iteration - confirm whether the step should be doubled.
2975 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// arithmetic shift right by 4, then pack with unsigned saturation to bytes
2977 __m128i pix = _mm_srai_epi16(submod, 4);
2978 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation; the statements guarding it are
// not visible in this listing
2982 __m128i pix = _mm_srai_epi16(submod, 4);
2983 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Bloom combine on 8-bit pixels over [span->startx, span->endx):
// out = saturate(ina + max(inb - subcolor * 255, 0)) per byte.
// subcolor points to four floats; components 0 and 2 are swapped before use.
// triangle is not referenced in the visible lines.
2990 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2993 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255 integers, components 0 and 2 swapped, then packed so
// both halves of the register hold the same four 16-bit values (one per pixel)
2994 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2995 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels (8 bytes) per iteration
2996 for (x = startx;x+2 <= endx;x+=2)
// zero-extend the bytes of both inputs to 16-bit lanes
2998 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2999 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract keeps inb - localcolor at zero or above
3000 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3001 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel; the statement guarding it is not
// visible in this listing
3005 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3006 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3007 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3008 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Multiplies two 8-bit pixel buffers byte by byte over [span->startx, span->endx):
// out = ina * inb / 256 (integer, truncated).
// triangle is not referenced in the visible lines.
3013 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3016 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per iteration
3017 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are zero-extended; inb bytes land in the high half of each lane
// (byte << 8), so the high 16 bits of the product are ina * inb >> 8
3019 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3020 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3021 pix1 = _mm_mulhi_epu16(pix1, pix2);
3022 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel; the statement guarding it is not
// visible in this listing
3026 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3027 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3028 pix1 = _mm_mulhi_epu16(pix1, pix2);
3029 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds two 8-bit pixel buffers byte by byte over [span->startx, span->endx),
// saturating each result at 255.
// triangle is not referenced in the visible lines.
3034 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3037 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per iteration
3038 for (x = startx;x+2 <= endx;x+=2)
// zero-extend to 16-bit lanes, add, then pack back with unsigned saturation
3040 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3041 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3042 pix1 = _mm_add_epi16(pix1, pix2);
3043 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel; the statement guarding it is not
// visible in this listing
3047 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3048 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3049 pix1 = _mm_add_epi16(pix1, pix2);
3050 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of inb to ina on 8-bit pixels over [span->startx, span->endx):
// out = saturate(ina + inb * tint) per byte, where tint is the four floats at
// inbtintbgra. Unlike the other uniform-color helpers in this file the tint is
// used in the order given (no component swap).
// triangle is not referenced in the visible lines.
3055 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3058 int x, startx = span->startx, endx = span->endx;
// tint in fixed point (scale 256), packed so both register halves hold the same four lanes
3059 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3060 tint = _mm_packs_epi32(tint, tint);
// two pixels (8 bytes) per iteration
3061 for (x = startx;x+2 <= endx;x+=2)
// inb bytes are placed in the high half of each lane (byte << 8) so that
// mulhi(tint, pix2) is inb * tint with the 256 scale removed
3063 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3064 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3065 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3066 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel; the statement guarding it is not
// visible in this listing
3070 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3071 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3072 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3073 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends ina toward inb on 8-bit pixels over [span->startx, span->endx), using
// the fourth byte of each inb pixel as the blend weight:
// out = ina + (inb - ina) * inb_alpha / 256 per byte.
// triangle is not referenced in the visible lines.
3078 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3081 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per iteration
3082 for (x = startx;x+2 <= endx;x+=2)
3084 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 (first pixel's fourth byte) over lanes 0-3 and lane 7
// (second pixel's fourth byte) over lanes 4-7
3086 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply returns
// (inb - ina) * blend >> 8
3087 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3088 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel (only the low four lanes matter);
// the statement guarding it is not visible in this listing
3092 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3093 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3094 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3095 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3096 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends 8-bit pixels toward one constant color over [span->startx, span->endx),
// weighted by the color's own fourth component:
// out = in + (color255 - in) * alpha255 / 256 per byte, where color255 is
// color * 255 with components 0 and 2 swapped.
// triangle is not referenced in the visible lines.
3101 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3104 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255 integers, components 0 and 2 swapped, packed so both
// register halves hold the same four 16-bit lanes (one copy per pixel)
3105 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3106 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend weight: the fourth lane of the color broadcast to every lane, pre-shifted
// left by 4 for the signed high-half multiply below
3107 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels (8 bytes) per iteration
3108 for (x = startx;x+2 <= endx;x+=2)
3110 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// ((color - pix) << 4) * (alpha << 4) >> 16 == (color - pix) * alpha >> 8
3111 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3112 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant for a leftover pixel; the statement guarding it is not
// visible in this listing
3116 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3117 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3118 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the "generic" shader: transforms/projects the position array
// with the ModelViewProjectionMatrixM1 uniform and passes the color and
// TEXCOORD0 arrays through DPSOFTRAST_Array_Load. TEXCOORD1 is loaded only when
// the SPECULAR permutation bit is set (the pixel stage samples a second texture
// only in that case).
// DPSOFTRAST_Array_TransformProject / DPSOFTRAST_Array_Load are defined outside this chunk.
3125 void DPSOFTRAST_VertexShader_Generic(void)
3127 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3128 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3129 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3130 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3131 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the "generic" shader for one span.
// With the DIFFUSE permutation: samples the first texture, multiplies it by the
// interpolated attribute 1, and, when SPECULAR is also set, combines a second
// texture into the result by multiply (COLORMAPPING), add (GLOW) or alpha mix
// (VERTEXTEXTUREBLEND). The remaining call writes the interpolated attribute 1
// alone (presumably the else-branch of the DIFFUSE test; the else line is not
// visible in this listing). The finished span is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// Fix: the "add" branch previously re-tested SHADERPERMUTATION_COLORMAPPING,
// which the preceding if had already handled, so AddBuffersBGRA8 could never
// run; it now tests SHADERPERMUTATION_GLOW.
3134 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3136 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3137 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3138 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3141 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture sampled with attribute array 2, then modulated by attribute array 1
3143 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3144 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3145 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// second texture is sampled with the same attribute array as the first
3147 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
// multiply
3148 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3151 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// add
3153 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3156 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// alpha mix using the second texture's alpha
3158 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3161 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured case: the interpolated attribute 1 alone
3166 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3167 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: transforms/projects the position
// array with the ModelViewProjectionMatrixM1 uniform and loads two texcoord sets.
// Note the second load is called with TEXCOORD1 and TEXCOORD4 (two different
// array indices); which argument is source and which is destination is defined
// by DPSOFTRAST_Array_Load, outside this chunk.
3172 void DPSOFTRAST_VertexShader_PostProcess(void)
3174 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3175 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3176 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader for one span: samples the first
// texture straight into the fragment color buffer, optionally adds bloom from
// the second texture (BLOOM permutation), then blends the result toward the
// ViewTintColor uniform. The SATURATION and GAMMARAMPS permutations are tested
// but not implemented. The span is finished with DPSOFTRAST_Draw_Span_FinishBGRA8.
3179 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3181 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3182 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3183 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3184 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture, sampled with attribute array 2
3186 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3187 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture, sampled with attribute array 3, added after subtracting the
// BloomColorSubtract uniform
3189 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3190 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// view tint: blend toward the uniform color, weighted by its fourth component
3192 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3193 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3195 // TODO: implement saturation
3197 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3199 // TODO: implement gammaramps
3201 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is
// transformed/projected with the ModelViewProjectionMatrixM1 uniform; no other
// vertex arrays are touched.
3206 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3208 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: fills the span's portion of the
// fragment color buffer with zero bytes and finishes the span.
3211 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3213 // this is never called (because colormask is off when this shader is used)
3214 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3215 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3216 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// four bytes per pixel, covering pixels startx .. endx-1
3217 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3218 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: transforms/projects the position array
// with the ModelViewProjectionMatrixM1 uniform and transforms TEXCOORD0 with
// the TexMatrixM1 uniform.
3223 void DPSOFTRAST_VertexShader_FlatColor(void)
3225 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3226 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span: samples the color texture
// and multiplies it by the Color_Ambient uniform (with the Alpha uniform in the
// fourth channel). When neither alpha test nor blending is active the result is
// written straight into the framebuffer; otherwise it is written to a local
// buffer and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): short lines (local declarations, braces, the extra x advance
// after the four-pixel path and any per-pixel mask test) are absent from this listing.
3229 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3232 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's row in color buffer 0, four bytes per pixel
3233 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3234 int x, startx = span->startx, endx = span->endx;
3235 __m128i Color_Ambientm;
3236 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3237 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3240 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the Finish step, so render into the local buffer instead
3241 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3242 pixel = buffer_FragColorbgra8;
// ambient color in fixed point (scale 256) with components 0 and 2 swapped;
// element 3 is cleared and replaced by the Alpha uniform * 255 (truncated),
// then everything is packed to 16-bit lanes, duplicated in both register halves
3243 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3244 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3245 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3246 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3247 for (x = startx;x < endx;x++)
// fast path: at least four pixels remain and all four mask bytes equal 0x01
3250 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texture bytes go to the high half of each 16-bit lane (byte << 8), so mulhi
// with the scale-256 color removes the fixed-point scale; low/high unpack cover
// pixels 0-1 and 2-3
3253 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3254 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3255 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3256 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3262 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3263 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3264 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered case still needs alpha test / blending applied
3266 if (pixel == buffer_FragColorbgra8)
3267 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: transforms/projects the position
// array with the ModelViewProjectionMatrixM1 uniform, loads the color array and
// transforms TEXCOORD0 with the TexMatrixM1 uniform.
3273 void DPSOFTRAST_VertexShader_VertexColor(void)
3275 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3276 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3277 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span:
// out = texture * (Color_Diffuse * interpolated vertex color + Color_Ambient),
// computed in 16-bit fixed point. When neither alpha test nor blending is
// active the result is written straight into the framebuffer; otherwise it is
// written to a local buffer and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): short lines (declarations of data/slope/mod2/pix2, braces, the
// extra x advance after the four-pixel path and any per-pixel mask test) are
// absent from this listing.
3280 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3283 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's row in color buffer 0, four bytes per pixel
3284 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3285 int x, startx = span->startx, endx = span->endx;
3286 __m128i Color_Ambientm, Color_Diffusem;
3288 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3289 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3290 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3292 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3293 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the Finish step, so render into the local buffer instead
3294 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3295 pixel = buffer_FragColorbgra8;
// ambient: scale 256, components 0 and 2 swapped, element 3 replaced by the
// Alpha uniform * 255 (truncated), packed to 16-bit lanes in both register halves
3296 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3297 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3298 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3299 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: scale 4096, components 0 and 2 swapped, element 3 cleared so it adds
// nothing to the fourth channel
3300 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3301 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3302 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color attribute and its per-pixel slope (macro defined outside this
// chunk), with components 0 and 2 swapped, advanced to startx and scaled by 4096
3303 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3304 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3305 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3306 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3307 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3308 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data advances one pixel per loop iteration
3309 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3311 __m128i color, mod, pix;
// fast path: at least four pixels remain and all four mask bytes equal 0x01
3312 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four consecutive buffer_z values, one broadcast per pixel below
3315 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3316 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// mod holds the modulators of pixels 0-1, mod2 those of pixels 2-3; data is
// stepped three times here and once more by the loop increment
3317 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3318 data = _mm_add_ps(data, slope);
3319 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3320 data = _mm_add_ps(data, slope);
3321 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3322 data = _mm_add_ps(data, slope);
3323 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// two scale-4096 factors multiplied with mulhi give a scale-256 value, matching
// the ambient term; the texture byte sits in the high half of its lane
3324 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3325 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3326 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3327 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3328 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3334 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3335 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3336 mod = _mm_packs_epi32(mod, mod);
3337 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3338 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered case still needs alpha test / blending applied
3340 if (pixel == buffer_FragColorbgra8)
3341 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: transforms/projects the position array
// with the ModelViewProjectionMatrixM1 uniform, transforms TEXCOORD0 with the
// TexMatrixM1 uniform and loads TEXCOORD4 (used by the pixel stage to sample
// the lightmap).
3347 void DPSOFTRAST_VertexShader_Lightmap(void)
3349 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3350 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3351 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
// out = color * (Color_Diffuse * lightmap + Color_Ambient), plus
// Color_Glow * glow when the GLOW permutation is set, in 16-bit fixed point.
// When neither alpha test nor blending is active the result is written straight
// into the framebuffer; otherwise it goes to a local buffer that is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): short lines (pix2 declarations, braces, the else between the
// glow and non-glow loops, the extra x advance after each four-pixel path and
// any per-pixel mask test) are absent from this listing.
3354 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3357 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's row in color buffer 0, four bytes per pixel
3358 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3359 int x, startx = span->startx, endx = span->endx;
3360 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3361 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3362 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3363 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, the lightmap uses TEXCOORD4
3367 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3368 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test or blending needs the Finish step, so render into the local buffer instead
3369 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3370 pixel = buffer_FragColorbgra8;
// ambient: scale 256, components 0 and 2 swapped, element 3 replaced by the
// Alpha uniform * 255 (truncated), packed to 16-bit lanes in both register halves
3371 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3372 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3373 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3374 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: scale 256, components 0 and 2 swapped, element 3 cleared
3375 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3376 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3377 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3378 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture shares TEXCOORD0 with the color texture; the glow color is
// prepared like the diffuse color (scale 256, element 3 cleared)
3380 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3381 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow color; used by the single-pixel path
3384 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3385 for (x = startx;x < endx;x++)
3387 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and all four mask bytes equal 0x01
3388 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3391 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3392 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3393 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// pix covers pixels 0-1 (low unpack), pix2 pixels 2-3 (high unpack):
// (diffuse * lightmap + ambient) * color + glowcolor * glow
3394 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3395 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3396 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3397 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3398 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3399 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3400 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: lightmap only occupies the low four lanes, so the low half
// computes (diffuse * lightmap + ambient) * color while the high half computes
// glowcolor * glow in the same multiply; the halves are then added together
3406 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3407 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3408 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3409 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3410 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3411 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same loop without the glow term (presumably the else-branch of the GLOW test;
// the else line is not visible in this listing)
3416 for (x = startx;x < endx;x++)
3418 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and all four mask bytes equal 0x01
3419 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3422 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3423 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3424 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3425 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3426 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3427 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3428 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3434 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3435 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3436 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3437 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered case still needs alpha test / blending applied
3440 if (pixel == buffer_FragColorbgra8)
3441 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shaders below call
// the LightDirection shaders, whose definitions appear later in the file.
3446 void DPSOFTRAST_VertexShader_LightDirection(void);
3447 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex stage of the fake-light shader: delegates entirely to the
// LightDirection vertex shader.
3449 void DPSOFTRAST_VertexShader_FakeLight(void)
3451 DPSOFTRAST_VertexShader_LightDirection();
// Pixel stage of the fake-light shader: delegates entirely to the
// LightDirection pixel shader with the same thread, triangle and span.
3454 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3456 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage of the model-space light-direction-map shader: runs the
// LightDirection vertex shader and additionally loads the TEXCOORD4 array.
3461 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3463 DPSOFTRAST_VertexShader_LightDirection();
3464 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the model-space light-direction-map shader: delegates entirely
// to the LightDirection pixel shader with the same thread, triangle and span.
3467 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3469 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage of the tangent-space light-direction-map shader: runs the
// LightDirection vertex shader and additionally loads the TEXCOORD4 array
// (the visible body is the same as the model-space variant).
3474 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3476 DPSOFTRAST_VertexShader_LightDirection();
3477 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the tangent-space light-direction-map shader: delegates
// entirely to the LightDirection pixel shader with the same thread, triangle and span.
3480 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3482 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3487 void DPSOFTRAST_VertexShader_LightDirection(void)
3490 int numvertices = dpsoftrast.numvertices;
3492 float LightVector[4];
3493 float EyePosition[4];
3494 float EyeVectorModelSpace[4];
3500 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3501 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3502 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3503 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3504 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3505 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3506 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3507 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3508 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3509 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3510 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3511 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3512 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3513 for (i = 0;i < numvertices;i++)
3515 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3516 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3517 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3518 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3519 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3520 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3521 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3522 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3523 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3524 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3525 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3526 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3527 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3528 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3529 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3530 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3531 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3532 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3533 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3534 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3535 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3536 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3537 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3538 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3539 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3540 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3541 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3542 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3543 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3545 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar / 3-component vector helper macros.
// NOTE: these are function-like macros that mention their arguments more than once,
// so an argument with side effects (e.g. i++) would be evaluated repeatedly.
// Minimum and maximum of two values.
3548 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3549 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product over elements [0], [1] and [2] of a and b.
3550 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length: dot product of v with itself.
3551 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Length: sqrt() of the squared length.
3552 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Normalization helper. The visible part of the body computes len = sqrt(dot(v,v)) into a float;
// NOTE(review): the rest of the multi-line body is not among the lines shown here — presumably it scales v by 1/len; confirm.
3553 #define DPSOFTRAST_Vector3Normalize(v)\
3556 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Shared pixel shader behind the FakeLight and LightDirectionMap_* pixel shader wrappers.
// Shades one span (x from span->startx up to but not including span->endx) into buffer_FragColorbgra8
// and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread->shader_permutation selects one of three per-pixel loops:
//   SHADERPERMUTATION_SPECULAR : ambient + diffuse + specular
//   SHADERPERMUTATION_DIFFUSE  : ambient + diffuse
//   neither                    : ambient only
// SHADERPERMUTATION_GLOW additionally adds the glow texture (scaled by Color_Glow) in each loop.
//
// Inside the two lit loops thread->shader_mode selects where the light direction and colour come from:
//   SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE   : deluxemap texel rotated by the interpolated TEXCOORD1..3 vectors, lightmap colour
//   SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE : deluxemap texel used directly, lightmap colour
//   SHADERMODE_FAKELIGHT                      : interpolated eye vector, white light
//   otherwise (presumably; the else itself is not visible) : interpolated TEXCOORD5 light vector, LightColor uniform
//
// NOTE(review): z, glosstex, eyenormal, diffuse, specular, f and d are used below, but no declaration or
// (for z) assignment of them appears among the lines visible here; the same goes for the braces and
// else keywords that delimit the branches.
3567 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers: one float per pixel for buffer_z, four bytes (B,G,R,A) per pixel for each texture/colour buffer
3569 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3570 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3572 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3573 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3574 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3575 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3576 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579 int x, startx = span->startx, endx = span->endx;
3580 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolants produced by DPSOFTRAST_CALCATTRIB4F; they are evaluated below as data + slope * x
3581 float LightVectordata[4];
3582 float LightVectorslope[4];
3583 float EyeVectordata[4];
3584 float EyeVectorslope[4];
3585 float VectorSdata[4];
3586 float VectorSslope[4];
3587 float VectorTdata[4];
3588 float VectorTslope[4];
3589 float VectorRdata[4];
3590 float VectorRslope[4];
3592 float diffusetex[4];
3594 float surfacenormal[4];
3595 float lightnormal[4];
3596 float lightnormal_modelspace[4];
3598 float specularnormal[4];
3601 float SpecularPower;
// Colour uniforms: components 0,1,2 are stored at indices 2,1,0 so they line up with the B,G,R byte order
// of the ...bgra8 buffers. Index 3 is zeroed, except Color_Ambient[3], which takes the Alpha uniform.
3603 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3604 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3605 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3606 Color_Glow[3] = 0.0f;
3607 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3608 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3609 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3610 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3611 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3612 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3613 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3614 Color_Pants[3] = 0.0f;
3615 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3616 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3617 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3618 Color_Shirt[3] = 0.0f;
// buffer_z is handed to DPSOFTRAST_Draw_Span_Begin and then to every texture fetch below;
// the base colour texture is always sampled, using TEXCOORD0
3619 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3620 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// pants and shirt layers are only sampled when colormapping is enabled
3621 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3623 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3624 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow layer is only sampled when glow is enabled
3626 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3628 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: specular permutation (ambient + diffuse + specular) ----
3630 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3632 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3633 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3634 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3635 Color_Diffuse[3] = 0.0f;
// default light colour from the uniform; the lightmap and fake-light modes overwrite components 0..2 per pixel
3636 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3637 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3638 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3639 LightColor[3] = 0.0f;
3640 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3641 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3642 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3643 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3644 Color_Specular[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the gloss alpha byte (0..255) to form the pow() exponent
3645 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// the eye vector (TEXCOORD6) is needed by every mode in this path for the specular term
3646 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3647 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// mode-specific inputs: interpolated TEXCOORD1..3 vectors plus lightmap/deluxemap (TEXCOORD4) for model space,
// lightmap/deluxemap only for tangent space, nothing extra for fake light, interpolated light vector (TEXCOORD5) otherwise
3649 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3651 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3652 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3653 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3654 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3655 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3657 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3659 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3660 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3662 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3664 // nothing of this needed
3668 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// per-pixel loop of the specular path
3671 for (x = startx;x < endx;x++)
// base colour texel as floats in the 0..255 range
3674 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3675 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3676 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3677 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// add the tinted pants/shirt layers; Color_Pants[3] and Color_Shirt[3] are 0, so alpha is left unchanged
3678 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3680 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3681 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3682 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3683 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3685 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3686 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3687 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3688 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel as byte/128 - 1; bytes at offsets +2,+1,+0 become components 0,1,2
3689 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3690 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3691 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3692 DPSOFTRAST_Vector3Normalize(surfacenormal);
3694 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3696 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3697 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3698 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3699 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3701 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3702 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3703 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3704 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3706 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3707 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3708 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3709 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3711 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3712 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3713 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3714 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3716 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3717 DPSOFTRAST_Vector3Normalize(lightnormal);
3719 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// lightmap bytes scaled by 1/(256*max(0.25, lightnormal[2])); the 0.25 floor caps the boost at 4x relative to 1/256
3721 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3722 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3723 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3724 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3727 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// deluxemap texel decoded the same way and used directly as the light direction (no normalize call is visible for it)
3729 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3730 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3731 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3733 float f = 1.0f / 256.0f;
3734 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3735 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3736 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3739 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// fake light: the normalized eye vector doubles as the light direction and the light colour is white
// NOTE(review): z is presumably the per-pixel value taken from buffer_z; its assignment is not visible here
3741 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3742 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3743 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3744 DPSOFTRAST_Vector3Normalize(lightnormal);
3746 LightColor[0] = 1.0;
3747 LightColor[1] = 1.0;
3748 LightColor[2] = 1.0;
// presumably the remaining-mode branch: normalized interpolated light vector; LightColor keeps the uniform value
3752 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3753 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3754 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3755 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal), clamped at 0
3758 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term: reflection vector against the eye vector when shader_exactspecularmath is set
3760 if(thread->shader_exactspecularmath)
3762 // reflect lightnormal at surfacenormal, take the negative of that
3763 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3765 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3766 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3767 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3768 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3770 // dot of this and normalize(EyeVectorFogDepth.xyz)
3771 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3772 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3773 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3774 DPSOFTRAST_Vector3Normalize(eyenormal);
3776 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// presumably the non-exact branch: half vector = normalize(lightnormal + eyenormal), dotted with the surface normal
3780 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3781 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3782 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3783 DPSOFTRAST_Vector3Normalize(eyenormal);
3785 specularnormal[0] = lightnormal[0] + eyenormal[0];
3786 specularnormal[1] = lightnormal[1] + eyenormal[1];
3787 specularnormal[2] = lightnormal[2] + eyenormal[2];
3788 DPSOFTRAST_Vector3Normalize(specularnormal);
3790 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower (already divided by 255) times the gloss alpha byte
3793 specular = pow(specular, SpecularPower * glosstex[3]);
// compose the BGRA result; only the upper bound (255) is clamped here, and alpha is diffuse alpha times Color_Ambient[3]
3794 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3796 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3797 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3798 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3799 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same composition without the glow term
3803 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3804 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3805 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3806 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3809 buffer_FragColorbgra8[x*4+0] = d[0];
3810 buffer_FragColorbgra8[x*4+1] = d[1];
3811 buffer_FragColorbgra8[x*4+2] = d[2];
3812 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: diffuse permutation (ambient + diffuse, no specular, no colormapping add) ----
3815 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3817 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3818 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3819 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3820 Color_Diffuse[3] = 0.0f;
3821 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3822 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3823 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3824 LightColor[3] = 0.0f;
3825 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// mode-specific inputs, as in the specular path; here the eye vector is only set up for the fake-light mode
3827 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3829 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3830 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3831 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3832 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3833 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3835 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3837 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3838 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3840 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3842 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3846 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// per-pixel loop of the diffuse path
3849 for (x = startx;x < endx;x++)
3852 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3853 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3854 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3855 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// decode the normal map texel as byte/128 - 1; bytes at offsets +2,+1,+0 become components 0,1,2
3856 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3857 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3858 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3859 DPSOFTRAST_Vector3Normalize(surfacenormal);
3861 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3863 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3864 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3865 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3866 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3868 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3869 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3870 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3871 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3873 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3874 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3875 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3876 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3878 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3879 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3880 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3881 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3883 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3884 DPSOFTRAST_Vector3Normalize(lightnormal);
3886 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// lightmap bytes scaled by 1/(256*max(0.25, lightnormal[2])); the 0.25 floor caps the boost at 4x relative to 1/256
3888 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3889 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3890 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3891 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3894 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// deluxemap texel used directly as the light direction (no normalize call is visible for it)
3896 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3897 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3898 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3900 float f = 1.0f / 256.0f;
3901 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3902 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3903 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3906 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// fake light: the normalized eye vector doubles as the light direction and the light colour is white
3908 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3909 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3910 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3911 DPSOFTRAST_Vector3Normalize(lightnormal);
3913 LightColor[0] = 1.0;
3914 LightColor[1] = 1.0;
3915 LightColor[2] = 1.0;
// presumably the remaining-mode branch: normalized interpolated light vector; LightColor keeps the uniform value
3919 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3920 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3921 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3922 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal), clamped at 0
3925 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// compose the BGRA result; only the upper bound (255) is clamped here
3926 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3928 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3929 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3930 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3931 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// same composition without the glow term
3935 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3936 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3937 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3938 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3940 buffer_FragColorbgra8[x*4+0] = d[0];
3941 buffer_FragColorbgra8[x*4+1] = d[1];
3942 buffer_FragColorbgra8[x*4+2] = d[2];
3943 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: neither specular nor diffuse: base colour times Color_Ambient, plus glow if enabled ----
3948 for (x = startx;x < endx;x++)
3951 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3952 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3953 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3954 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3956 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3958 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3959 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3960 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3961 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3965 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3966 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3967 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3968 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3970 buffer_FragColorbgra8[x*4+0] = d[0];
3971 buffer_FragColorbgra8[x*4+1] = d[1];
3972 buffer_FragColorbgra8[x*4+2] = d[2];
3973 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish routine
3976 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3981 void DPSOFTRAST_VertexShader_LightSource(void)
3984 int numvertices = dpsoftrast.numvertices;
3985 float LightPosition[4];
3986 float LightVector[4];
3987 float LightVectorModelSpace[4];
3988 float EyePosition[4];
3989 float EyeVectorModelSpace[4];
3995 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3996 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3997 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3998 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3999 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4000 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4001 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4002 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4003 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4004 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4005 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4006 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4007 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4008 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4009 for (i = 0;i < numvertices;i++)
4011 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4012 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4013 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4014 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4015 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4016 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4017 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4018 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4019 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4020 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4021 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4022 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4023 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4024 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4025 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4026 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4027 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4028 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4029 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4030 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4031 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4032 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4033 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4034 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4035 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4036 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4037 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4038 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4039 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4040 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4041 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4042 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4044 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4045 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span (pixel) shader for the LightSource shader mode.
// For one span it reads the lighting uniforms from thread->uniform4f, samples
// the needed textures into per-span BGRA8 byte buffers, shades every x in
// [span->startx, span->endx) into buffer_FragColorbgra8 and hands that buffer
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Three shading paths are selected by thread->shader_permutation:
// SPECULAR (ambient + diffuse + specular), DIFFUSE (ambient + diffuse), and a
// fallback that uses only the ambient colour.
4048 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4051 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4052 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4053 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4054 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4055 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4056 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4057 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4058 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4059 int x, startx = span->startx, endx = span->endx;
4060 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4061 float CubeVectordata[4];
4062 float CubeVectorslope[4];
4063 float LightVectordata[4];
4064 float LightVectorslope[4];
4065 float EyeVectordata[4];
4066 float EyeVectorslope[4];
4068 float diffusetex[4];
4070 float surfacenormal[4];
4071 float lightnormal[4];
4073 float specularnormal[4];
4076 float SpecularPower;
4077 float CubeVector[4];
// Colour uniforms are copied with components 0 and 2 swapped (local index 2
// receives uniform component 0), so local index i lines up with byte i of the
// "bgra8" texel buffers used below.
4080 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4081 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4082 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4083 Color_Glow[3] = 0.0f;
4084 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4085 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4086 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component is taken from the separate Alpha uniform.
4087 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4088 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4089 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4090 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4091 Color_Diffuse[3] = 0.0f;
4092 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4093 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4094 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4095 Color_Specular[3] = 0.0f;
4096 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4097 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4098 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4099 Color_Pants[3] = 0.0f;
4100 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4101 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4102 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4103 Color_Shirt[3] = 0.0f;
4104 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4105 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4106 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4107 LightColor[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by glosstex[3], which is
// loaded from an unsigned char texel (0..255).
4108 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Per-span interpolants, evaluated below as (data[i] + slope[i]*x) * z:
// TEXCOORD1 -> light vector, TEXCOORD2 -> eye vector, TEXCOORD3 -> CubeVector.
4109 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4110 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4111 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4112 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4113 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// Sample every texture this permutation needs for the whole span up front.
4114 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4115 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4117 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4118 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4120 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4121 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// Path 1: ambient + diffuse + specular (needs the normal and gloss maps).
4122 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4124 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4125 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4126 for (x = startx;x < endx;x++)
// NOTE(review): z is presumably the per-pixel value taken from buffer_z[x];
// its assignment is not visible here - confirm.
4129 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4130 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4131 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// Attenuation is 1 - |CubeVector|^2; pixels below the 0.01 threshold are
// presumably skipped (the output was pre-cleared to black for that reason).
4132 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4133 if (attenuation < 0.01f)
4135 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4137 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4138 if (attenuation < 0.01f)
// Base colour texel as floats in 0..255, optionally tinted by pants/shirt.
4142 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4143 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4144 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4145 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4146 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4148 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4149 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4150 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4151 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4153 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4154 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4155 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4156 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte/128 - 1 maps 0..255 to roughly [-1, 1); bytes
// 2,1,0 of the texel become components 0,1,2 of the normal.
4157 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4158 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4159 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4160 DPSOFTRAST_Vector3Normalize(surfacenormal);
4162 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4163 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4164 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4165 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(N, L) clamped to be non-negative.
4167 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4169 if(thread->shader_exactspecularmath)
4171 // reflect lightnormal at surfacenormal, take the negative of that
4172 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4174 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4175 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4176 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4177 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4179 // dot of this and normalize(EyeVectorFogDepth.xyz)
4180 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4181 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4182 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4183 DPSOFTRAST_Vector3Normalize(eyenormal);
4185 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Alternative specular computation (presumably the else branch of the
// shader_exactspecularmath test): use normalize(light + eye) as the specular
// direction and dot it with the surface normal.
4189 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4190 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4191 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4192 DPSOFTRAST_Vector3Normalize(eyenormal);
4194 specularnormal[0] = lightnormal[0] + eyenormal[0];
4195 specularnormal[1] = lightnormal[1] + eyenormal[1];
4196 specularnormal[2] = lightnormal[2] + eyenormal[2];
4197 DPSOFTRAST_Vector3Normalize(specularnormal);
4199 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform scaled by the gloss texel's fourth byte.
4201 specular = pow(specular, SpecularPower * glosstex[3]);
4203 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4205 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4206 attenuation *= (1.0f / 255.0f);
4207 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4208 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4209 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4210 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Without the cube filter: same formula minus the cube texel factor.
4214 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4215 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4216 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4217 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Store the clamped result; only the upper bound (255) is clamped above.
4219 buffer_FragColorbgra8[x*4+0] = d[0];
4220 buffer_FragColorbgra8[x*4+1] = d[1];
4221 buffer_FragColorbgra8[x*4+2] = d[2];
4222 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 2: ambient + diffuse, no specular term (normal map only).
4225 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4227 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4228 for (x = startx;x < endx;x++)
4231 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4232 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4233 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4234 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4235 if (attenuation < 0.01f)
4237 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4239 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4240 if (attenuation < 0.01f)
4244 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4245 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4246 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4247 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4248 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4250 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4251 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4252 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4253 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4255 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4256 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4257 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4258 DPSOFTRAST_Vector3Normalize(surfacenormal);
4260 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4261 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4262 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4263 DPSOFTRAST_Vector3Normalize(lightnormal);
4265 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4266 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4268 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4269 attenuation *= (1.0f / 255.0f);
4270 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4271 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4272 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4273 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4277 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4278 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4279 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4280 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4282 buffer_FragColorbgra8[x*4+0] = d[0];
4283 buffer_FragColorbgra8[x*4+1] = d[1];
4284 buffer_FragColorbgra8[x*4+2] = d[2];
4285 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 3 (fallback): ambient colour only; no normal map is sampled.
4290 for (x = startx;x < endx;x++)
4293 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4294 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4295 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4296 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4297 if (attenuation < 0.01f)
4299 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4301 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4302 if (attenuation < 0.01f)
4306 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4307 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4308 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4309 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4310 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4312 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4313 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4314 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4315 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4317 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4319 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4320 attenuation *= (1.0f / 255.0f);
4321 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4322 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4323 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4324 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4328 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4329 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4330 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4331 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4333 buffer_FragColorbgra8[x*4+0] = d[0];
4334 buffer_FragColorbgra8[x*4+1] = d[1];
4335 buffer_FragColorbgra8[x*4+2] = d[2];
4336 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finish routine.
4339 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Refraction shader mode.
// Issues three array transforms using the ModelViewProjection and Tex matrices
// from dpsoftrast.uniform4f. The first argument of each helper is presumably
// the destination array and the second the source - TODO confirm against the
// helper definitions.
4345 void DPSOFTRAST_VertexShader_Refraction(void)
// TEXCOORD1 is derived from POSITION before POSITION itself is projected
// below; the pixel shader interpolates TEXCOORD1 as ModelViewProjectionPosition.
4347 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4348 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4349 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) shader for the Refraction shader mode.
// For each pixel of the span it computes a screen-space texture coordinate,
// offsets it by the normal-map texel scaled by DistortScaleRefractReflect,
// fetches the texture bound to GL20TU_REFRACTION at that coordinate (bilinear
// or nearest) and writes the texel multiplied by RefractColor.
// Returns without drawing anything when no refraction texture is bound.
4352 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4354 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4356 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4358 int x, startx = span->startx, endx = span->endx;
4361 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4362 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4365 float ModelViewProjectionPositiondata[4];
4366 float ModelViewProjectionPositionslope[4];
4369 float ScreenScaleRefractReflect[2];
4370 float ScreenCenterRefractReflect[2];
4371 float DistortScaleRefractReflect[2];
4372 float RefractColor[4];
4374 const unsigned char * RESTRICT pixelbase;
4375 const unsigned char * RESTRICT pixel[4];
4376 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4377 if(!texture) return;
// Only mip level 0 is sampled: mipmap[0][0] is used as a byte offset into
// texture->bytes, and mipmap[0][2] / mipmap[0][3] are used below as the
// level's width and height (row stride and clamp limits).
4378 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4381 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4382 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4385 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4388 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4389 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4390 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4391 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
// NOTE(review): the quoted GLSL further down uses DistortScaleRefractReflect.zw
// while this code reads components 0 and 1 - confirm which pair is intended.
4392 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4393 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// RefractColor is stored with components 0 and 2 swapped so that index i
// matches byte i of the BGRA output.
4394 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4395 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4396 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4397 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4400 for (x = startx;x < endx;x++)
4402 float SafeScreenTexCoord[2];
4403 float ScreenTexCoord[2];
4410 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4411 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4413 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4414 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4415 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4417 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// Decode the normal-map texel (byte/128 - 1) with bytes 2,1,0 -> components 0,1,2.
4418 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4419 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4420 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4421 DPSOFTRAST_Vector3Normalize(v);
4422 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4423 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4425 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4426 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// Bilinear fetch in fixed point with 12 fractional bits: the coordinate is
// scaled by size<<12 and shifted back by 2048 (half a texel).
// NOTE(review): tc is unsigned int but is initialised from a float expression
// that is negative for coordinates near or below 0; converting an
// out-of-range float to an unsigned integer is undefined in C. Because tci is
// derived from tc>>12 it can never be negative, so the ">= 0" clamps below
// cannot fire on this path - confirm the intended handling of negative
// coordinates.
4428 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4429 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4430 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// The four weights are products of two 12-bit factors and sum to 0x1000*0x1000
// (2^24), which is why the blended channels are shifted right by 24.
4431 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4432 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4433 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// Clamp both texel rows/columns to [0, size-1] (clamp-to-edge addressing).
4434 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4435 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4436 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4437 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
// 4 bytes per texel, rows of mipmap[0][2] texels.
4438 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4439 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4440 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4441 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4442 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4443 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4444 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// Nearest-texel fetch (presumably the else branch of the filter test): scale
// to texel units, truncate to int and clamp to [0, size-1].
4448 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4449 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4450 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4451 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4457 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// Tint the fetched texel; the alpha byte is RefractColor[3]*256 capped at 255.
4458 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4459 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4460 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4461 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4464 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Water shader mode: the only work is a TransformProject
// of the POSITION array (same array passed as both array arguments) with the
// ModelViewProjection matrix taken from dpsoftrast.uniform4f.
4469 void DPSOFTRAST_VertexShader_Water(void)
4471 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the Water shader mode. No water shading is computed here:
// the span's BGRA bytes are zero-filled and passed straight to the common
// finish routine.
4475 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4478 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4479 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4480 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero only the bytes covering [startx, endx); 4 bytes per pixel.
4481 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4482 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode: the only work is a
// TransformProject of the POSITION array (same array passed as both array
// arguments) with the ModelViewProjection matrix from dpsoftrast.uniform4f.
4487 void DPSOFTRAST_VertexShader_ShowDepth(void)
4489 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the ShowDepth shader mode. No depth visualisation is
// computed here: the span's BGRA bytes are zero-filled and passed straight to
// the common finish routine.
4492 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4495 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4496 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4497 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero only the bytes covering [startx, endx); 4 bytes per pixel.
4498 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4499 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode: the only work is a
// TransformProject of the POSITION array (same array passed as both array
// arguments) with the ModelViewProjection matrix from dpsoftrast.uniform4f.
4504 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4506 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the DeferredGeometry shader mode. No geometry attributes are
// written here: the span's BGRA bytes are zero-filled and passed straight to
// the common finish routine.
4509 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4512 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4513 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4514 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero only the bytes covering [startx, endx); 4 bytes per pixel.
4515 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4516 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode: the only work is a
// TransformProject of the POSITION array (same array passed as both array
// arguments) with the ModelViewProjection matrix from dpsoftrast.uniform4f.
4521 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4523 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the DeferredLightSource shader mode. No lighting is computed
// here: the span's BGRA bytes are zero-filled and passed straight to the
// common finish routine.
4526 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4529 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4530 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4531 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero only the bytes covering [startx, endx); 4 bytes per pixel.
4532 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4533 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode of the software rasterizer: the vertex-stage and
// span-stage callbacks plus the vertex arrays and texture units it uses.
// Instances live in DPSOFTRAST_ShaderModeTable.
4538 typedef struct DPSOFTRAST_ShaderModeInfo_s
// Vertex-stage callback; takes no arguments and works on global dpsoftrast state.
4541 void (*Vertex)(void);
// Span-stage (pixel shader) callback, invoked once per span.
4542 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices used by this mode; the table initialisers end each list with ~0.
4543 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* texture units used by this mode; the table initialisers end each list with ~0.
4544 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4546 DPSOFTRAST_ShaderModeInfo;
// Per-shader-mode dispatch table, indexed by thread->shader_mode (see
// DPSOFTRAST_Draw_ProcessSpans). Row order presumably follows the SHADERMODE_*
// enumeration - TODO confirm against its definition.
// Each row is: a leading integer (2 in every row), the vertex-stage function,
// the span-stage function, the list of DPSOFTRAST_ARRAY_* inputs and the list
// of GL20TU_* texture units. Both lists are terminated by ~0, which becomes
// 0xFF when stored in the unsigned char arrays.
4548 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4550 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4551 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4552 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4553 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4554 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4555 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4556 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4557 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4558 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4559 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4560 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4561 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
// NOTE(review): the four rows below give no initialiser for texunits, so that
// array is zero-filled (C aggregate initialisation) rather than starting with
// the ~0 terminator used by every other row - confirm that consumers of
// texunits cope with a list that has no terminator.
4562 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4563 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4564 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4565 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Depth-tests one span and records the per-pixel result.
// Fills thread->pixelmaskarray[x] for x in [startx, endx) with the outcome of
// comparing the stored depth value against the span's interpolated depth
// (depthbase + depthslope*x), trims rejected pixels from the ends of the span,
// and stores the mask pointer and the trimmed start back into the span.
// When depth testing is disabled or there is no depth buffer, every pixel of
// the span is marked as passing.
4568 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4573 unsigned int *depthpixel;
4577 unsigned char *pixelmask;
4578 DPSOFTRAST_State_Triangle *triangle;
4579 triangle = &thread->triangles[span->triangle];
// depthpixel points at the depth-buffer row of this span, offset by span->x,
// so it is indexed with the same x as the span.
4580 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4581 startx = span->startx;
4583 depth = span->depthbase;
4584 depthslope = span->depthslope;
4585 pixelmask = thread->pixelmaskarray;
4586 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// Each case walks the span once, stepping d by depthslope per pixel. The
// comparison is "stored value OP span value d": e.g. GL_LESS passes where
// depthpixel[x] < d.
4588 switch(thread->fb_depthfunc)
4591 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4592 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4593 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4594 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4595 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4596 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4597 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// Trim rejected pixels off both ends of the span (the loop bodies presumably
// advance startx and retreat endx - confirm).
4599 while (startx < endx && !pixelmask[startx])
4601 while (endx > startx && !pixelmask[endx-1])
4606 // no depth testing means we're just dealing with color...
4607 memset(pixelmask + startx, 1, endx - startx);
// Publish the mask and the trimmed start for the pixel shader / depth write.
4609 span->pixelmask = pixelmask;
4610 span->startx = startx;
// Depth-buffer write pass for one span.
// Does nothing unless depth writes (thread->depthmask) and depth testing are
// both enabled and a depth buffer exists. Otherwise it walks the span with the
// interpolated depth d = depthbase + depthslope*x; the loop body presumably
// stores d into depthpixel[x] for pixels still set in span->pixelmask -
// TODO confirm.
4614 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4616 int x, d, depth, depthslope, startx, endx;
4617 const unsigned char *pixelmask;
4618 unsigned int *depthpixel;
4619 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4621 depth = span->depthbase;
4622 depthslope = span->depthslope;
4623 pixelmask = span->pixelmask;
4624 startx = span->startx;
// Same addressing as the depth test: row span->y, offset by span->x.
4626 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4627 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Runs every span queued on this thread through the pipeline and then empties
// the queue. For each span: depth test, then (if a colour buffer exists and
// the colour mask is non-zero) the span shader of the current shader mode,
// then the depth write.
4633 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4636 DPSOFTRAST_State_Triangle *triangle;
4637 DPSOFTRAST_State_Span *span;
4638 for (i = 0; i < thread->numspans; i++)
4640 span = &thread->spans[i];
4641 triangle = &thread->triangles[span->triangle];
4642 DPSOFTRAST_Draw_DepthTest(thread, span);
// A span with no pixels left after depth trimming is presumably skipped here.
4643 if (span->startx >= span->endx)
4645 // run pixel shader if appropriate
4646 // do this before running depthmask code, to allow the pixelshader
4647 // to clear pixelmask values for alpha testing
4648 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4649 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4650 DPSOFTRAST_Draw_DepthWrite(thread, span);
// All queued spans have been consumed.
4652 thread->numspans = 0;
// Draw command record (opcode 22).
// starty/endy are the row range filled in by DPSOFTRAST_DrawTriangles, refcount is initialised to the
// thread count and decremented once per thread below, clipped is copied from dpsoftrast.drawclipped,
// arrays points at the per-vertex float4 data and element3i/element3s hold the copied index list.
4655 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of a single thread.
// For each triangle it gathers the three vertices, interpolates replacement vertices where the triangle
// crosses the near plane, computes a 16-bit screen bounding box clamped to the scissor/band limits,
// derives per-triangle interpolation planes (w, optional clip plane, shader attributes, mip level) and
// then walks rows, appending spans to thread->spans.
// Span and triangle queues are flushed through DPSOFTRAST_Draw_ProcessSpans when they fill up and at the end.
// The command is reference counted: the thread that brings refcount to zero frees command->arrays when
// that data was allocated outside the command pool.
// NOTE(review): many short lines (braces, continue/break, some conditions) are not visible in this excerpt;
// comments below describe only what the visible lines establish.
4657 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4660 int cullface = thread->cullface;
4661 int minx, maxx, miny, maxy;
4662 int miny1, maxy1, miny2, maxy2;
4663 __m128i fbmin, fbmax;
4664 __m128 viewportcenter, viewportscale;
4665 int firstvertex = command->firstvertex;
4666 int numvertices = command->numvertices;
4667 int numtriangles = command->numtriangles;
4668 const int *element3i = command->element3i;
4669 const unsigned short *element3s = command->element3s;
4670 int clipped = command->clipped;
4677 int starty, endy, bandy;
4681 float clip0origin, clip0slope;
4683 __m128 triangleedge1, triangleedge2, trianglenormal;
4686 DPSOFTRAST_State_Triangle *triangle;
4687 DPSOFTRAST_Texture *texture;
4688 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical limits: the scissor rectangle first, then this thread's two row bands clamped into it
4689 miny = thread->fb_scissor[1];
4690 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4691 miny1 = bound(miny, thread->miny1, maxy);
4692 maxy1 = bound(miny, thread->maxy1, maxy);
4693 miny2 = bound(miny, thread->miny2, maxy);
4694 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range misses both bands: nothing to draw here, but our reference is still released
4695 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
// a commandsize no larger than the bare command struct means the vertex data was allocated separately
// (see DPSOFTRAST_Draw_AllocateDrawCommand), so the last thread to release the command frees it
4697 if (!ATOMIC_DECREMENT(command->refcount))
4699 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4700 MM_FREE(command->arrays);
4704 minx = thread->fb_scissor[0];
4705 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// packed 16-bit (x, y) clamp limits repeated four times; fbmax has 1 subtracted so it is an inclusive limit
4706 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4707 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4708 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4709 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4710 screen[3] = _mm_setzero_ps();
4711 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4712 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices screen-space float4s; the per-vertex arrays follow it
4714 const float *screencoord4f = command->arrays;
4715 const float *arrays = screencoord4f + numvertices*4;
4717 // generate the 3 edges of this triangle
4718 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices are rebased by firstvertex, read from either the 16-bit or the 32-bit index list
// NOTE(review): the conditions selecting between the two index lists are not visible in this excerpt
4721 e[0] = element3s[i*3+0] - firstvertex;
4722 e[1] = element3s[i*3+1] - firstvertex;
4723 e[2] = element3s[i*3+2] - firstvertex;
4727 e[0] = element3i[i*3+0] - firstvertex;
4728 e[1] = element3i[i*3+1] - firstvertex;
4729 e[2] = element3i[i*3+2] - firstvertex;
// SKIPBACKFACE: builds two screen-space edges sharing vertex 1 and computes the scalar
// edge1.x*edge2.y - edge1.y*edge2.x in the low lane of trianglenormal; the compares against zero
// test its sign (the statements they guard and the cullface selection are not visible in this excerpt)
4738 #define SKIPBACKFACE \
4739 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4740 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4741 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4742 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4743 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4747 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4751 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4756 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4757 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4759 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4760 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4762 #define CLIPPEDVERTEXCOPY(k,p1) \
4763 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIBCOPY: load one vertex's float4 attribute unchanged
4765 #define GENATTRIBCOPY(attrib, p1) \
4766 attrib = _mm_load_ps(&arrays[e[p1]*4]);
// GENATTRIBLERP: interpolate an attribute between two vertices using the fraction that
// CLIPPEDVERTEXLERP stored in clipfrac[p1]
4767 #define GENATTRIBLERP(attrib, p1, p2) \
4769 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4770 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4772 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4776 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4777 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4778 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4779 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4780 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4781 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4782 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4788 // calculate distance from nearplane
// clipdist is component 2 plus component 3 of each vertex's float4 in the first per-vertex array;
// the >= 0 tests below classify each vertex, and CLIPPEDVERTEXLERP creates a new point on each crossing edge
// (one vertex failing yields four screen points, two failing yields three)
4789 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4790 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4791 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4792 if (clipdist[0] >= 0.0f)
4794 if (clipdist[1] >= 0.0f)
4796 if (clipdist[2] >= 0.0f)
4799 // triangle is entirely in front of nearplane
4800 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4807 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4815 if (clipdist[2] >= 0.0f)
4817 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4824 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4831 else if (clipdist[1] >= 0.0f)
4833 if (clipdist[2] >= 0.0f)
4835 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4842 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4848 else if (clipdist[2] >= 0.0f)
4850 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4855 else continue; // triangle is entirely behind nearplane
// screeni packs the truncated x,y of the points as saturated 16-bit values (point 2 is duplicated when
// there is no fourth point); the shuffles below reduce them to a min/max bounding box clamped to fbmin/fbmax
4858 // calculate integer y coords for triangle points
4859 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4860 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4861 screenmin = _mm_min_epi16(screeni, screenir),
4862 screenmax = _mm_max_epi16(screeni, screenir);
4863 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4864 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4865 screenmin = _mm_max_epi16(screenmin, fbmin);
4866 screenmax = _mm_min_epi16(screenmax, fbmax);
4867 // skip offscreen triangles
4868 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// row range of the clamped bounding box; endy is one past the last row
4870 starty = _mm_extract_epi16(screenmin, 1);
4871 endy = _mm_extract_epi16(screenmax, 1)+1;
// NOTE(review): the statement guarded here is not visible; the test matches a triangle lying wholly between the two bands
4872 if (starty >= maxy1 && endy <= miny2)
// screeny: the y half of each packed (x, y) pair, sign-extended to 32 bits
4874 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle queue
4877 triangle = &thread->triangles[thread->numtriangles];
4879 // calculate attribute plans for triangle data...
4880 // okay, this triangle is going to produce spans, we'd better project
4881 // the interpolants now (this is what gives perspective texturing),
4882 // this consists of simply multiplying all arrays by the W coord
4883 // (which is basically 1/Z), which will be undone per-pixel
4884 // (multiplying by Z again) to get the perspective-correct array
4887 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4888 __m128 mipedgescale, mipdensity;
// the two edge vectors divided by the facing term; each lane is broadcast below as a coefficient
// that converts per-vertex differences into x and y gradients
4889 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4890 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4891 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4892 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4893 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of each screen point (component 3), broadcast to all lanes
4894 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4895 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4896 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4897 attribedge1 = _mm_sub_ss(w0, w1);
4898 attribedge2 = _mm_sub_ss(w2, w1);
4899 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4900 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4901 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4902 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4903 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// triangle->w[] = { x gradient, y gradient, value at the screen origin } of the w plane
4904 _mm_store_ss(&triangle->w[0], attribxslope);
4905 _mm_store_ss(&triangle->w[1], attribyslope);
4906 _mm_store_ss(&triangle->w[2], attriborigin);
// optional clip plane: build the plane of screen component 2 the same way as w, then fold it into
// a linear function of screen x,y using thread->fb_clipplane
4911 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4913 float cliporigin, clipxslope, clipyslope;
4914 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4915 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4916 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4917 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4918 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4919 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4920 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4921 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4922 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
// clip0origin + row*clip0slope is the x at which the clip function crosses zero on a row; clip0dir records the side
// NOTE(review): the first guarding condition and some clip0dir assignments are not visible in this excerpt
4925 clip0origin = -cliporigin/clipxslope;
4926 clip0slope = -clipyslope/clipxslope;
4927 clip0dir = clipxslope > 0 ? 1 : -1;
4929 else if(clipyslope > 0)
4931 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4932 clip0slope = dpsoftrast.fb_width;
4935 else if(clipyslope < 0)
4937 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4938 clip0slope = -dpsoftrast.fb_width;
4941 else if(clip0origin < 0) continue;
// build gradient planes for every attribute array listed by the current shader mode
4944 mipedgescale = _mm_setzero_ps();
4945 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4947 __m128 attrib0, attrib1, attrib2;
4948 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// NOTE(review): the statement guarded by this out-of-range check is not visible in this excerpt
4949 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// advance to the next per-vertex array in the command data
4951 arrays += numvertices*4;
4952 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are multiplied by w before the plane is built (see the perspective note above)
4953 attriborigin = _mm_mul_ps(attrib1, w1);
4954 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4955 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4956 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4957 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4958 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4959 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4960 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4961 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array the shader mode designates for LOD, scale the attribute differences along both edges
// by an approximate reciprocal edge length (rsqrt of the squared length)
4962 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4964 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4965 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4966 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4967 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level per texture unit listed by the shader mode; units default to level 0
4971 memset(triangle->mip, 0, sizeof(triangle->mip));
4972 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4974 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// NOTE(review): the statement guarded by this out-of-range check is not visible in this excerpt
4975 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4977 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed level
4978 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two ints loaded from texture->mipmap[0][2..3], square, sum per edge, keep the smaller edge value
4980 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4981 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4982 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4983 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4984 // this will be multiplied in the texturing routine by the texture resolution
4985 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), i.e. log2 of the square root of the squared density, clamped to the last mip level
// NOTE(review): the guard lines just before this computation are not visible in this excerpt
4988 y = (int)(log((float)y)*0.5f/M_LN2);
4989 if (y > texture->mipmaps - 1)
4990 y = texture->mipmaps - 1;
4991 triangle->mip[texunit] = y;
// walk rows in up to two bands: first limited by maxy1, then continuing from miny2 limited by maxy2
4997 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5000 __m128 xcoords, xslope;
// ycc lanes are all-ones for points whose integer y is less than the current row; the byte mask
// built from it selects which two edges (edge0, edge1; p = start point, n = end point) cross this row
5001 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5002 int yccmask = _mm_movemask_epi8(ycc);
5003 int edge0p, edge0n, edge1p, edge1n;
// table using all four point flags
// NOTE(review): the switch statements and the condition choosing between the two tables are not visible in this excerpt
5012 case 0xFFFF: /*0000*/ y = endy; continue;
5013 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5014 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5015 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5016 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5017 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5018 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5019 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5020 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5021 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5022 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5023 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5024 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5025 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5026 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5027 case 0x0000: /*1111*/ y++; continue;
// table using three point flags (point 2 occupies both upper nibbles because it was duplicated in screeni)
5035 case 0xFFFF: /*000*/ y = endy; continue;
5036 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5037 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5038 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5039 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5040 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5041 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5042 case 0x0000: /*111*/ y++; continue;
// nexty: flagged points become 0x7FFF so the min reductions pick the smallest y among the remaining
// points; the result is then limited to the last row of the current band
5045 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5046 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5047 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5048 nexty = _mm_extract_epi16(ycc, 0);
5049 if (nexty >= bandy) nexty = bandy-1;
// xslope = dx/dy of both edges; xcoords = x of both edges at row y, plus 0.5
5050 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5051 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5052 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5053 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5054 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// swap the two halves when edge0 starts to the right of edge1, so the low lane holds the smaller x
5055 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5057 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5058 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5060 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
// one iteration per row up to nexty, stepping both edge x values and the clip crossing incrementally
5061 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5063 int startx, endx, offset;
// row extent from the two edge x values, clamped to the scissor in x
5064 startx = _mm_cvtss_si32(xcoords);
5065 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5066 if (startx < minx) startx = minx;
5067 if (endx > maxx) endx = maxx;
5068 if (startx >= endx) continue;
// trim or reject the row against the clip crossing x
// NOTE(review): the conditions on clip0dir and the matching endx adjustment are not visible in this excerpt
5076 if(endx <= clip0) continue;
5077 startx = (int)clip0;
5080 else if (endx > clip0)
5082 if(startx >= clip0) continue;
// emit the row in chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5087 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5089 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5090 span->triangle = thread->numtriangles;
// NOTE(review): assignments to span->x, span->y and span->startx are not visible in this excerpt
5094 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5095 if (span->startx >= span->endx)
// integer depth plane for the span, derived from the triangle's w plane; polygon offset is applied to the base
5097 wslope = triangle->w[0];
5098 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5099 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5100 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// flush as soon as the span queue is full
5101 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5102 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: process the spans that reference it before reusing the slots
5107 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5109 DPSOFTRAST_Draw_ProcessSpans(thread);
5110 thread->numtriangles = 0;
// release this thread's reference; the last one frees separately allocated vertex data
5114 if (!ATOMIC_DECREMENT(command->refcount))
5116 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5117 MM_FREE(command->arrays);
// process whatever is still queued so nothing refers to the command after this function
5120 if (thread->numspans > 0 || thread->numtriangles > 0)
5122 DPSOFTRAST_Draw_ProcessSpans(thread);
5123 thread->numtriangles = 0;
// Allocates a Draw command plus storage for its vertex data and index list, and points the global
// vertex-output slots (dpsoftrast.screencoord4f, dpsoftrast.post_array4f[]) into that storage.
//   firstvertex, numvertices, numtriangles - batch description, copied into the command
//   element3i / element3s                  - 32-bit or 16-bit index list; the one used is copied into the command data
// Returns the new command (the return statement itself is not visible in this excerpt).
// Data layout: numvertices float4 screen coords, numvertices float4 positions, one float4 array per
// shader-mode array, then the indices.
5128 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5132 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 arrays are always present: screen coordinates and post-transform positions
5133 int datasize = 2*numvertices*sizeof(float[4]);
5134 DPSOFTRAST_Command_Draw *command;
5135 unsigned char *data;
// add one float4 array for each array slot the current shader mode lists
5136 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5138 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// NOTE(review): the statement guarded by this out-of-range check is not visible in this excerpt
5139 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5141 datasize += numvertices*sizeof(float[4]);
// room for the index list
// NOTE(review): the conditions choosing between the 16-bit and 32-bit sizes are not visible in this excerpt
5144 datasize += numtriangles*sizeof(unsigned short[3]);
5146 datasize += numtriangles*sizeof(int[3]);
5147 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for a single pool command: allocate only the command header in the pool and
// take zeroed data from MM_CALLOC; otherwise the data sits directly after the header in the pool
5148 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5150 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5151 data = (unsigned char *)MM_CALLOC(datasize, 1);
5155 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5156 data = (unsigned char *)command + commandsize;
5158 command->firstvertex = firstvertex;
5159 command->numvertices = numvertices;
5160 command->numtriangles = numtriangles;
5161 command->arrays = (float *)data;
// reset every output slot, then hand out consecutive numvertices-sized float4 regions
5162 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5163 dpsoftrast.firstvertex = firstvertex;
5164 dpsoftrast.numvertices = numvertices;
5165 dpsoftrast.screencoord4f = (float *)data;
5166 data += numvertices*sizeof(float[4]);
5167 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5168 data += numvertices*sizeof(float[4]);
5169 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5171 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5172 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5174 dpsoftrast.post_array4f[j] = (float *)data;
5175 data += numvertices*sizeof(float[4]);
// the command keeps its own copy of the index list, stored after the vertex arrays
// NOTE(review): the conditions selecting which list is copied are not visible in this excerpt
5177 command->element3i = NULL;
5178 command->element3s = NULL;
5181 command->element3s = (unsigned short *)data;
5182 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5186 command->element3i = (int *)data;
5187 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: queues one batch of indexed triangles.
// Allocates the Draw command, runs the current shader mode's Vertex() callback, records the row
// range the batch covers, then either wakes worker threads whose bands overlap it or flushes.
//   firstvertex, numvertices, numtriangles - batch description
//   element3i / element3s                  - 32-bit or 16-bit index list
5192 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5194 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5195 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// row range taken from dpsoftrast.drawstarty/drawendy, clamped to the framebuffer height
5196 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5197 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: free separately allocated vertex data and take the command back out of the pool
// NOTE(review): the early return that presumably follows is not visible in this excerpt
5198 if (command->starty >= command->endy)
5200 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5201 MM_FREE(command->arrays);
5202 DPSOFTRAST_UndoCommand(command->commandsize);
5205 command->clipped = dpsoftrast.drawclipped;
// every thread releases one reference in DPSOFTRAST_Interpret_Draw
5206 command->refcount = dpsoftrast.numthreads;
5208 if (dpsoftrast.usethreads)
5211 DPSOFTRAST_Draw_SyncCommands();
5212 for (i = 0; i < dpsoftrast.numthreads; i++)
5214 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// wake only starving threads whose first or second band overlaps the command's rows
5215 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5216 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the non-threaded branch (the else is not visible in this excerpt)
5221 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command record (opcode 23) carrying the new width and height.
5225 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler: marks the thread's framebuffer-derived state as needing revalidation.
// The visible body does not read any field of the command.
5226 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5228 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point that switches the framebuffer size and the depth/color target pointers.
//   width, height              - framebuffer dimensions
//   depthpixels                - depth buffer
//   colorpixels0..colorpixels3 - color targets
// Compares the arguments with the current global state, stores them, recomputes the viewport
// transform and queues a SetRenderTargets command.
// NOTE(review): the statement guarded by the comparison is not visible in this excerpt, so which of
// the following lines run only on a change cannot be confirmed from here.
5230 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5232 DPSOFTRAST_Command_SetRenderTargets *command;
5233 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5234 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5235 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5237 dpsoftrast.fb_width = width;
5238 dpsoftrast.fb_height = height;
5239 dpsoftrast.fb_depthpixels = depthpixels;
5240 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5241 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5242 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5243 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// the viewport transform is recomputed after the framebuffer size is stored
5244 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5245 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5246 command->width = width;
5247 command->height = height;
// Runs queued commands for one thread, starting at thread->commandoffset and stopping once the read
// offset equals endoffset.
//   thread    - thread whose state the commands are applied to
//   endoffset - command pool offset at which to stop
// Fixed-size commands advance by their aligned struct size (INTERPCOMMAND); Draw commands advance by
// command->commandsize. An offset at or past DPSOFTRAST_DRAW_MAXCOMMANDPOOL wraps to the pool start.
// thread->commandoffset is written back after every Draw command and once more at the end.
5250 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5252 int commandoffset = thread->commandoffset;
5253 while (commandoffset != endoffset)
5255 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5256 switch (command->opcode)
// INTERPCOMMAND(name): case label that calls DPSOFTRAST_Interpret_<name> and steps over the fixed-size command
5258 #define INTERPCOMMAND(name) \
5259 case DPSOFTRAST_OPCODE_##name : \
5260 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5261 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5262 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5263 commandoffset = 0; \
5265 INTERPCOMMAND(Viewport)
5266 INTERPCOMMAND(ClearColor)
5267 INTERPCOMMAND(ClearDepth)
5268 INTERPCOMMAND(ColorMask)
5269 INTERPCOMMAND(DepthTest)
5270 INTERPCOMMAND(ScissorTest)
5271 INTERPCOMMAND(Scissor)
5272 INTERPCOMMAND(BlendFunc)
5273 INTERPCOMMAND(BlendSubtract)
5274 INTERPCOMMAND(DepthMask)
5275 INTERPCOMMAND(DepthFunc)
5276 INTERPCOMMAND(DepthRange)
5277 INTERPCOMMAND(PolygonOffset)
5278 INTERPCOMMAND(CullFace)
5279 INTERPCOMMAND(AlphaTest)
5280 INTERPCOMMAND(AlphaFunc)
5281 INTERPCOMMAND(SetTexture)
5282 INTERPCOMMAND(SetShader)
5283 INTERPCOMMAND(Uniform4f)
5284 INTERPCOMMAND(UniformMatrix4f)
5285 INTERPCOMMAND(Uniform1i)
5286 INTERPCOMMAND(SetRenderTargets)
5287 INTERPCOMMAND(ClipPlane)
// Draw commands are variable-sized, so they advance by the size stored in the command itself
5289 case DPSOFTRAST_OPCODE_Draw:
5290 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5291 commandoffset += command->commandsize;
// NOTE(review): the wrap assignment guarded by this check is not visible in this excerpt
5292 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// write progress back after each draw so other code comparing thread->commandoffset sees it
5294 thread->commandoffset = commandoffset;
// NOTE(review): the handling of the Reset opcode is not visible in this excerpt
5297 case DPSOFTRAST_OPCODE_Reset:
5302 thread->commandoffset = commandoffset;
// Worker thread entry point, passed to Thread_CreateThread by DPSOFTRAST_Init.
//   data - the DPSOFTRAST_State_Thread this worker services
// Loops while thread->index is non-negative: interprets pending commands, then sleeps on drawcond
// once it has caught up with dpsoftrast.drawcommand.
// NOTE(review): the return statement is not visible in this excerpt.
5305 static int DPSOFTRAST_Draw_Thread(void *data)
5307 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5308 while(thread->index >= 0)
// there is work between our read offset and the producer's write offset
5310 if (thread->commandoffset != dpsoftrast.drawcommand)
5312 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5316 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex before sleeping: still caught up and not asked to exit
5317 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// tell a flusher blocked on waitcond that this thread has caught up
5319 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving is set for the duration of the wait so producers know a drawcond signal is needed
5320 thread->starving = true;
5321 Thread_CondWait(thread->drawcond, thread->drawmutex);
5322 thread->starving = false;
5324 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread's command offset up to dpsoftrast.drawcommand, then marks the command pool as empty.
// With worker threads: first wakes starving threads that are behind, then blocks on each lagging
// thread's waitcond. The last loop interprets the outstanding commands directly on the caller.
// NOTE(review): the else separating the threaded loops from the last loop is not visible in this
// excerpt; the last loop is presumably the non-threaded path - confirm.
5330 static void DPSOFTRAST_Draw_FlushThreads(void)
5332 DPSOFTRAST_State_Thread *thread;
5334 DPSOFTRAST_Draw_SyncCommands();
5335 if (dpsoftrast.usethreads)
// pass 1: wake any sleeping worker that still has commands to run
5337 for (i = 0; i < dpsoftrast.numthreads; i++)
5339 thread = &dpsoftrast.threads[i];
5340 if (thread->commandoffset != dpsoftrast.drawcommand)
5342 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex; only a starving thread is blocked on drawcond
5343 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5344 Thread_CondSignal(thread->drawcond);
5345 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each worker that is still behind
5348 for (i = 0; i < dpsoftrast.numthreads; i++)
5350 thread = &dpsoftrast.threads[i];
5351 if (thread->commandoffset != dpsoftrast.drawcommand)
5353 Thread_LockMutex(thread->drawmutex);
// NOTE(review): the wait is guarded by an if, not a loop, so a wakeup that arrives before the
// worker has caught up is not re-checked here - confirm whether Thread_CondWait can wake spuriously
5354 if (thread->commandoffset != dpsoftrast.drawcommand)
// waiting asks the worker to signal waitcond when it catches up (see DPSOFTRAST_Draw_Thread)
5356 thread->waiting = true;
5357 Thread_CondWait(thread->waitcond, thread->drawmutex);
5358 thread->waiting = false;
5360 Thread_UnlockMutex(thread->drawmutex);
// run the outstanding commands for each thread state on the calling thread
5366 for (i = 0; i < dpsoftrast.numthreads; i++)
5368 thread = &dpsoftrast.threads[i];
5369 if (thread->commandoffset != dpsoftrast.drawcommand)
5370 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// reset the pool's used-command count
5373 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
5376 void DPSOFTRAST_Flush(void)
5378 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point taking no arguments.
// NOTE(review): the body is not visible in this excerpt; presumably it flushes outstanding work - confirm.
5381 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and the per-thread state blocks, starting worker threads
// when threading is enabled.
//   width, height - framebuffer dimensions
//   numthreads    - requested worker count; threading is used only if this is > 0 and Thread_HasThreads() is true
//   interlace     - clamped to 0..1 when threading is used, otherwise forced to 0
//   colorpixels   - first color target
//   depthpixels   - depth buffer
// NOTE(review): the return statement and a few short lines (including the declaration of u) are not
// visible in this excerpt.
5386 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero global state
5396 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): u is declared outside the visible lines; presumably a byte-order probe - confirm
5397 dpsoftrast.bigendian = u.b[3];
5398 dpsoftrast.fb_width = width;
5399 dpsoftrast.fb_height = height;
5400 dpsoftrast.fb_depthpixels = depthpixels;
5401 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): the next three lines all assign index 1; indices 2 and 3 look intended.
// They are already zero from the memset above, so the result is the same either way.
5402 dpsoftrast.fb_colorpixels[1] = NULL;
5403 dpsoftrast.fb_colorpixels[1] = NULL;
5404 dpsoftrast.fb_colorpixels[1] = NULL;
// default viewport covers the whole framebuffer
5405 dpsoftrast.viewport[0] = 0;
5406 dpsoftrast.viewport[1] = 0;
5407 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5408 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5409 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with no table capacity recorded (texture_max = 0)
5410 dpsoftrast.texture_firstfree = 1;
5411 dpsoftrast.texture_end = 1;
5412 dpsoftrast.texture_max = 0;
5413 dpsoftrast.color[0] = 1;
5414 dpsoftrast.color[1] = 1;
5415 dpsoftrast.color[2] = 1;
5416 dpsoftrast.color[3] = 1;
// threading needs both a positive request and platform support; without it exactly one thread state exists
5417 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5418 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5419 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5420 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5421 for (i = 0; i < dpsoftrast.numthreads; i++)
5423 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// per-thread defaults for the render state
5425 thread->cullface = GL_BACK;
5426 thread->colormask[0] = 1;
5427 thread->colormask[1] = 1;
5428 thread->colormask[2] = 1;
5429 thread->colormask[3] = 1;
5430 thread->blendfunc[0] = GL_ONE;
5431 thread->blendfunc[1] = GL_ZERO;
5432 thread->depthmask = true;
5433 thread->depthtest = true;
5434 thread->depthfunc = GL_LEQUAL;
5435 thread->scissortest = false;
5436 thread->alphatest = false;
5437 thread->alphafunc = GL_GREATER;
5438 thread->alphavalue = 0.5f;
5439 thread->viewport[0] = 0;
5440 thread->viewport[1] = 0;
5441 thread->viewport[2] = dpsoftrast.fb_width;
5442 thread->viewport[3] = dpsoftrast.fb_height;
5443 thread->scissor[0] = 0;
5444 thread->scissor[1] = 0;
5445 thread->scissor[2] = dpsoftrast.fb_width;
5446 thread->scissor[3] = dpsoftrast.fb_height;
5447 thread->depthrange[0] = 0;
5448 thread->depthrange[1] = 1;
5449 thread->polygonoffset[0] = 0;
5450 thread->polygonoffset[1] = 0;
5451 thread->clipplane[0] = 0;
5452 thread->clipplane[1] = 0;
5453 thread->clipplane[2] = 0;
5454 thread->clipplane[3] = 1;
// empty queues, read offset at the start of the command pool, not sleeping
5456 thread->numspans = 0;
5457 thread->numtriangles = 0;
5458 thread->commandoffset = 0;
5459 thread->waiting = false;
5460 thread->starving = false;
// mark everything as needing validation and validate once up front
5462 thread->validate = -1;
5463 DPSOFTRAST_Validate(thread, -1);
// synchronisation objects are created before the worker is started
5465 if (dpsoftrast.usethreads)
5467 thread->waitcond = Thread_CreateCond();
5468 thread->drawcond = Thread_CreateCond();
5469 thread->drawmutex = Thread_CreateMutex();
5470 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Tears the rasterizer down: stops and joins worker threads, destroys their synchronisation
// objects, frees texture storage and the thread array, and zeroes the global state.
5476 void DPSOFTRAST_Shutdown(void)
5479 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5481 DPSOFTRAST_State_Thread *thread;
5482 for (i = 0; i < dpsoftrast.numthreads; i++)
5484 thread = &dpsoftrast.threads[i];
5485 Thread_LockMutex(thread->drawmutex);
// NOTE(review): a short line is missing here in this excerpt; presumably it sets thread->index
// negative so the worker loop in DPSOFTRAST_Draw_Thread exits - confirm
// wake the worker in case it is sleeping on drawcond, then wait for it to exit
5487 Thread_CondSignal(thread->drawcond);
5488 Thread_UnlockMutex(thread->drawmutex);
5489 Thread_WaitThread(thread->thread, 0);
5490 Thread_DestroyCond(thread->waitcond);
5491 Thread_DestroyCond(thread->drawcond);
5492 Thread_DestroyMutex(thread->drawmutex);
// free per-texture pixel storage
// NOTE(review): this loop indexes dpsoftrast.texture before the NULL check below, and DPSOFTRAST_Init
// leaves texture_end at 1 after zeroing the struct; if the table can still be NULL here this
// dereferences a NULL pointer - confirm
5495 for (i = 0;i < dpsoftrast.texture_end;i++)
5496 if (dpsoftrast.texture[i].bytes)
5497 MM_FREE(dpsoftrast.texture[i].bytes);
// the texture table is released with free(), the thread array with MM_FREE
5498 if (dpsoftrast.texture)
5499 free(dpsoftrast.texture);
5500 if (dpsoftrast.threads)
5501 MM_FREE(dpsoftrast.threads);
// leave the global state zeroed
5502 memset(&dpsoftrast, 0, sizeof(dpsoftrast));