3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Byte alignment handed to _mm_malloc by MM_MALLOC/MM_CALLOC; the ATOMIC() wrappers below hard-code the same 32.
18 #define ATOMIC_SIZE 32
// Per-toolchain primitives defined in each branch below:
//   ALIGN(var)      - declare var with 16-byte alignment
//   ATOMIC(var)     - declare var with 32-byte alignment
//   MEMORY_BARRIER  - _mm_sfence(), i.e. a store fence
//   ATOMIC_COUNTER  - type of a shared counter
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD - operations on such a counter
// Apple toolchains: OSAtomic*Barrier routines from <libkern/OSAtomic.h>.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC-compatible compilers: __sync_* builtins. Increment/decrement yield the new value, ATOMIC_ADD yields nothing.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: Interlocked* functions operating on a volatile LONG.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment request, no barrier, and plain (non-atomic) integer arithmetic.
// NOTE(review): the guards for ALIGN, ATOMIC and ATOMIC_ADD are presumably #ifndef like the ones below - confirm.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// Allocation helpers. This first group uses the SSE2 header's _mm_malloc/_mm_free with ATOMIC_SIZE alignment.
74 #include <emmintrin.h>
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-like helper on top of _mm_malloc: requests nmemb*size bytes aligned to ATOMIC_SIZE
// and zero-fills them when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow, unlike calloc.
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Memory from MM_MALLOC/MM_CALLOC above must be released with _mm_free.
85 #define MM_FREE _mm_free
// Alternative group mapping straight onto the C allocator.
// NOTE(review): presumably selected when the SSE path is unavailable - the guarding directive should be confirmed.
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Slots of the per-vertex attribute arrays: position, color and eight texcoord sets.
// DPSOFTRAST_ARRAY_TOTAL is the slot count and sizes attribs[] in the triangle state
// and post_array4f[] in the global state.
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One entry of the texture table (dpsoftrast.texture[]).
108 typedef struct DPSOFTRAST_Texture_s
// filter mode set through DPSOFTRAST_Texture_Filter
115 DPSOFTRAST_TEXTURE_FILTER filter;
// shared counter; NOTE(review): presumably counts outstanding binds by queued draw state - confirm against users
118 ATOMIC_COUNTER binds;
// pixel storage for all mip levels (allocated in DPSOFTRAST_Texture_New); NULL marks an unused slot
119 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands stored in the command pool are aligned/padded to COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND).
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command: an opcode byte followed by the total size of the command.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode;
130 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when the remaining tail of the pool is too small for a command.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> and the
// aligned struct type DPSOFTRAST_Command_<name>, which begins with the common header.
// NOTE(review): `fields` is presumably spliced in after the header members - confirm.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer (2 MiB).
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands; DPSOFTRAST_AllocateCommand hands out space from `commands`.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
// raw command bytes, 32-byte aligned
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the span code.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array: [0] = change per x step, [1] = change per y step, [2] = base value
// (this is how the DPSOFTRAST_CALCATTRIB macros read it), four floats each
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// Evaluate an interpolated attribute at the start of a span (SSE version).
//   triangle   - pointer to a triangle state with an attribs[][3][4] table
//   span       - pointer to a span providing integer x and y
//   data       - __m128 lvalue receiving attribs[a][2] + x*attribs[a][0] + y*attribs[a][1]
//   slope      - __m128 lvalue receiving attribs[a][0] (the per-x step)
//   arrayindex - attribute slot (DPSOFTRAST_ARRAY_*)
// The attribs rows are read with _mm_load_ps, so they must be 16-byte aligned.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
		_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
			_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4] arrays.
// Every macro argument is parenthesized so that expression arguments such as
// `&span_object` or `buffer + 4` expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle; 16-byte aligned via ALIGN().
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
192 DPSOFTRAST_State_Span);
// Capacity of the per-thread spans[] and triangles[] arrays in DPSOFTRAST_State_Thread.
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in thread->validate. Each bit names a group of derived values that
// DPSOFTRAST_Validate recomputes (RecalcFB / RecalcDepthFunc / RecalcBlendFunc) and then clears.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups together
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer distinguishes. DPSOFTRAST_RecalcBlendFunc maps the
// (sfactor, dfactor, blendsubtract) state onto one of these; unrecognized combinations become OPAQUE.
202 typedef enum DPSOFTRAST_BLENDMODE_e
204 DPSOFTRAST_BLENDMODE_OPAQUE,
205 DPSOFTRAST_BLENDMODE_ALPHA,
206 DPSOFTRAST_BLENDMODE_ADDALPHA,
207 DPSOFTRAST_BLENDMODE_ADD,
208 DPSOFTRAST_BLENDMODE_INVMOD,
209 DPSOFTRAST_BLENDMODE_MUL,
210 DPSOFTRAST_BLENDMODE_MUL2,
211 DPSOFTRAST_BLENDMODE_SUBALPHA,
212 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
213 DPSOFTRAST_BLENDMODE_INVADD,
// number of blend modes
214 DPSOFTRAST_BLENDMODE_TOTAL
216 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state. The DPSOFTRAST_Interpret_* handlers write the raw state into it,
// and DPSOFTRAST_Validate refreshes the derived values below.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// written by DPSOFTRAST_Interpret_PolygonOffset: [0] = alongnormal, [1] = intoview
237 float polygonoffset[2];
240 int shader_permutation;
241 int shader_exactspecularmath;
// textures bound per unit; these point into dpsoftrast.texture[] and are rebased by DPSOFTRAST_Texture_Grow
243 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
245 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
248 // DPSOFTRAST_VALIDATE_ flags
251 // derived values (DPSOFTRAST_VALIDATE_FB)
254 ALIGN(float fb_viewportcenter[4]);
255 ALIGN(float fb_viewportscale[4]);
257 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
260 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool of this thread; DPSOFTRAST_Draw_FreeCommandPool compares it with the
// producer's freecommand/drawcommand to measure how far the thread lags behind
269 ATOMIC(volatile int commandoffset);
// waiting: set by DPSOFTRAST_Draw_FreeCommandPool while it blocks on this thread's waitcond
271 volatile bool waiting;
// starving: when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before waiting
272 volatile bool starving;
279 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
280 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
282 DPSOFTRAST_State_Thread);
// Global rasterizer state owned by the API side; the single instance is `dpsoftrast` below.
284 typedef ATOMIC(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers; rows are fb_width pixels apart (see the Clear handlers)
288 unsigned int *fb_depthpixels;
289 unsigned int *fb_colorpixels[4];
// viewport transform derived by DPSOFTRAST_RecalcViewport from dpsoftrast.viewport
292 ALIGN(float fb_viewportcenter[4]);
293 ALIGN(float fb_viewportscale[4]);
296 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
297 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-supplied vertex array pointers
299 const float *pointer_vertex3f;
300 const float *pointer_color4f;
301 const unsigned char *pointer_color4ub;
302 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
305 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
306 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// textures bound per unit; these point into `texture` and are rebased by DPSOFTRAST_Texture_Grow
307 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
311 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
312 float *screencoord4f;
318 int shader_permutation;
319 int shader_exactspecularmath;
// texture table: DPSOFTRAST_Texture_New starts its search for a free slot at texture_firstfree
323 int texture_firstfree;
324 DPSOFTRAST_Texture *texture;
// most recent error message, set by the validation checks (e.g. in DPSOFTRAST_Texture_New)
329 const char *errorstring;
// array of numthreads per-thread states
334 DPSOFTRAST_State_Thread *threads;
// set to commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
336 ATOMIC(volatile int drawcommand);
338 DPSOFTRAST_State_Command_Pool commandpool;
// the one global rasterizer instance
342 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating point depth to the integer depth buffer (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels (nominally 0..1) into one BGRA8 pixel: a<<24 | r<<16 | g<<8 | b,
// each channel rounded to nearest. Arguments are parenthesized so expressions can be passed,
// and the shifts are done on unsigned values because shifting a signed int into the sign bit
// (alpha >= 128) is undefined. The result is converted back to int, as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Convert a float depth d to the integer depth format: DEPTHSCALE * (1 - d), truncated.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Derive the viewport transform from a viewport rectangle.
//   viewport          - {x, y, width, height}
//   fb_viewportcenter - out, 4 floats: [0] = 0, [1] = x center, [2] = y center, [3] = 0.5
//   fb_viewportscale  - out, 4 floats: [0] = 1, [1] = half width, [2] = minus half height, [3] = 0.5
// Reads dpsoftrast.fb_height to flip the y axis.
350 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
// centers are shifted by -0.5
352 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
353 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
354 fb_viewportcenter[3] = 0.5f;
355 fb_viewportcenter[0] = 0.0f;
356 fb_viewportscale[1] = 0.5f * viewport[2];
// negative scale: y is measured in the opposite direction from viewport[1]
357 fb_viewportscale[2] = -0.5f * viewport[3];
358 fb_viewportscale[3] = 0.5f;
359 fb_viewportscale[0] = 1.0f;
// Assign the framebuffer rows [miny1, maxy1) and [miny2, maxy2) that this thread is responsible for,
// based on thread->index, dpsoftrast.numthreads and dpsoftrast.fb_height.
362 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the height is cut into 2*numthreads slices and each thread gets slice `index`
// and slice `numthreads + index`
364 if (dpsoftrast.interlace)
366 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
367 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// otherwise: one contiguous slice, stored identically in both ranges
373 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
374 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Recompute the values derived under DPSOFTRAST_VALIDATE_FB: thread->fb_scissor (x, y, width, height
// in framebuffer rows), the viewport transform, and the thread's row ranges.
378 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
380 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
381 // and viewport projection values
// scissor rectangle with y flipped against fb_height
384 x1 = thread->scissor[0];
385 x2 = thread->scissor[0] + thread->scissor[2];
386 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
387 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with scissor testing off the whole framebuffer is used
388 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
390 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
392 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
393 thread->fb_scissor[0] = x1;
394 thread->fb_scissor[1] = y1;
395 thread->fb_scissor[2] = x2 - x1;
396 thread->fb_scissor[3] = y2 - y1;
398 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
399 DPSOFTRAST_RecalcThread(thread);
// Recompute the effective depth function: the selected depthfunc while depth testing is enabled,
// GL_ALWAYS otherwise.
402 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
404 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Translate the (sfactor, dfactor) pair in thread->blendfunc, together with blendsubtract, into
// thread->fb_blendmode. The two factors are packed into one integer, (sfactor<<16)|dfactor, to switch on.
// Any pair without an explicit case falls back to DPSOFTRAST_BLENDMODE_OPAQUE.
407 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_ALPHA, GL_ONE) is recognized
409 if (thread->blendsubtract)
411 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
413 #define BLENDFUNC(sfactor, dfactor, blendmode) \
414 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
415 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
416 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending
421 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
423 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
424 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
425 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
426 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
427 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// the next two factor pairs both select MUL
428 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
429 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
430 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
431 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
432 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
433 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the requested bits is dirty; always evaluates to 0.
// NOTE(review): `thread` is not parenthesized in the expansion, so pass a plain identifier.
438 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Bring the derived state groups selected by `mask` (DPSOFTRAST_VALIDATE_* bits) up to date.
// Only groups that are both requested and currently dirty are recomputed; each dirty bit is
// cleared before its Recalc function runs.
440 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested bits that are actually dirty
442 mask &= thread->validate;
445 if (mask & DPSOFTRAST_VALIDATE_FB)
447 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
448 DPSOFTRAST_RecalcFB(thread);
450 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
452 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
453 DPSOFTRAST_RecalcDepthFunc(thread);
455 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
457 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
458 DPSOFTRAST_RecalcBlendFunc(thread);
// Look up a texture by handle. A handle is valid when it lies in [1, texture_end) and the slot
// has pixel storage; index 0 is never valid.
// Returns a pointer into dpsoftrast.texture[], which DPSOFTRAST_Texture_Grow may move via realloc.
462 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
464 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
465 return &dpsoftrast.texture[index];
// Enlarge the texture table and re-point every bound-texture pointer (global and per thread)
// at the reallocated array.
// NOTE(review): the realloc result overwrites dpsoftrast.texture without a NULL check, so a failed
// allocation loses the old table and the rebasing below would then work from a NULL base.
469 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base so bound pointers can be converted to indices after the move
471 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
472 DPSOFTRAST_State_Thread *thread;
476 // expand texture array as needed
// capacity policy: at least 1024 entries, growing by doubling
477 if (dpsoftrast.texture_max < 1024)
478 dpsoftrast.texture_max = 1024;
480 dpsoftrast.texture_max *= 2;
481 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the API-side bindings: new base + (old pointer - old base)
482 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
483 if (dpsoftrast.texbound[i])
484 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase every thread's bindings the same way
485 for (j = 0; j < dpsoftrast.numthreads; j++)
487 thread = &dpsoftrast.threads[j];
488 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
489 if (thread->texbound[i])
490 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Create a texture and return its handle (index into the texture table).
//   flags  - DPSOFTRAST_TEXTURE_FLAG_* bits combined with a DPSOFTRAST_TEXTURE_FORMAT_* value
//   width, height, depth - dimensions in texels; each must be a power of two, at least 1 and at most
//                          DPSOFTRAST_TEXTURE_MAXSIZE
// Every rejected request stores an explanatory message in dpsoftrast.errorstring.
494 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces, everything else one
503 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
504 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
505 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is formed before the per-dimension size limit is checked, so very large
// arguments could overflow int here.
506 if (width*height*depth < 1)
508 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
511 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
513 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
518 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
519 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
520 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
522 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
523 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): same message as the cubemap case above; if this assignment belongs to a different
// condition, the text looks copy-pasted - confirm.
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
533 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
535 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are not supported
540 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
542 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
545 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
547 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this check repeats the previous condition and message verbatim.
550 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
552 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
555 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
557 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
560 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
562 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
565 // find first empty slot in texture array
566 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
567 if (!dpsoftrast.texture[texnum].bytes)
569 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the chosen slot lies beyond its capacity, then extend the used range
570 if (dpsoftrast.texture_max <= texnum)
571 DPSOFTRAST_Texture_Grow();
572 if (dpsoftrast.texture_end <= texnum)
573 dpsoftrast.texture_end = texnum + 1;
574 texture = &dpsoftrast.texture[texnum];
575 memset(texture, 0, sizeof(*texture));
576 texture->flags = flags;
577 texture->width = width;
578 texture->height = height;
579 texture->depth = depth;
580 texture->sides = sides;
// lay out the mip chain: each level records its byte offset, byte size (4 bytes per texel, all
// sides) and dimensions
592 s = w * h * d * sides * 4;
593 texture->mipmap[mipmaps][0] = size;
594 texture->mipmap[mipmaps][1] = s;
595 texture->mipmap[mipmaps][2] = w;
596 texture->mipmap[mipmaps][3] = h;
597 texture->mipmap[mipmaps][4] = d;
// the chain ends at a single texel, or after the first level when mipmapping was not requested
600 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
606 texture->mipmaps = mipmaps;
607 texture->size = size;
609 // allocate the pixels now
// zero-filled storage for all levels; a non-NULL bytes pointer is what marks the slot as in use
610 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Destroy the texture with the given handle; invalid handles are ignored.
// Releases the pixel storage, clears the slot and updates the free/used bookkeeping.
614 void DPSOFTRAST_Texture_Free(int index)
616 DPSOFTRAST_Texture *texture;
617 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
621 MM_FREE(texture->bytes);
622 texture->bytes = NULL;
// zeroing the whole slot also leaves bytes == NULL, which marks it as free
623 memset(texture, 0, sizeof(*texture));
624 // adjust the free range and used range
625 if (dpsoftrast.texture_firstfree > index)
626 dpsoftrast.texture_firstfree = index;
// trim trailing free slots from the used range
627 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
628 dpsoftrast.texture_end--;
// Regenerate mip levels 1..mipmaps-1 of a texture by averaging texels of the next larger level.
// Texels are 4 bytes; each channel is averaged with rounding. Invalid handles are ignored.
630 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
632 int i, x, y, z, w, layer0, layer1, row0, row1;
// o = output row in level i; i0..i3 = source rows in level i-1 (two rows in each of two layers)
633 unsigned char *o, *i0, *i1, *i2, *i3;
634 DPSOFTRAST_Texture *texture;
635 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// nothing to generate for a texture with a single level
636 if (texture->mipmaps <= 1)
638 for (i = 1;i < texture->mipmaps;i++)
640 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the last layer of the parent level
644 if (layer1 >= texture->mipmap[i-1][4])
645 layer1 = texture->mipmap[i-1][4]-1;
646 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the last row of the parent level
650 if (row1 >= texture->mipmap[i-1][3])
651 row1 = texture->mipmap[i-1][3]-1;
// row start addresses: level offset + 4 bytes * ((height * layer + row) * width)
652 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
653 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
654 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
655 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
656 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
657 w = texture->mipmap[i][2];
660 if (texture->mipmap[i-1][2] > 1)
662 // average 3D texture
// eight source texels (2 per row, 4 rows) per output texel
663 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
665 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
666 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
667 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
668 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
673 // average 3D mipmap with parent width == 1
// NOTE(review): only i0 and i1 advance in this loop; that only matters if w can exceed 1 here - confirm
674 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
676 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
677 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
678 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
679 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
685 if (texture->mipmap[i-1][2] > 1)
687 // average 2D texture (common case)
// four source texels (2 per row, 2 rows) per output texel
688 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
690 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
691 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
692 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
693 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
698 // 2D texture with parent width == 1
699 o[0] = (i0[0] + i1[0] + 1) >> 1;
700 o[1] = (i0[1] + i1[1] + 1) >> 1;
701 o[2] = (i0[2] + i1[2] + 1) >> 1;
702 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copy a rectangular block of 4-byte texels into a texture and regenerate its mipmaps.
//   index  - texture handle; invalid handles are ignored
//   pixels - tightly packed source rows, blockwidth*4 bytes each
//   blockx, blocky, blockwidth, blockheight - destination rectangle in texels
// NOTE(review): the destination address is computed from mipmap[0] only, so `mip` does not select
// the level written by the copy shown here; the rectangle is also not range-checked in these lines.
709 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
711 DPSOFTRAST_Texture *texture;
713 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel inside level 0
718 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
719 while (blockheight > 0)
721 memcpy(dst, pixels, blockwidth * 4);
// the source advances by the block width, the destination by the full level-0 row
722 pixels += blockwidth * 4;
723 dst += texture->mipmap[0][2] * 4;
727 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replace the whole base level of a texture and regenerate its mipmaps.
//   index  - texture handle; invalid handles are ignored
//   pixels - source data; mipmap[0][1] bytes (the byte size of level 0) are copied
729 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
731 DPSOFTRAST_Texture *texture;
732 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
736 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
737 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Width in texels of mip level `mip` of the texture, or 0 for an invalid handle.
// NOTE(review): `mip` is used as an array index without a range check.
739 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
741 DPSOFTRAST_Texture *texture;
742 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
743 return texture->mipmap[mip][2];
// Height in texels of mip level `mip` of the texture, or 0 for an invalid handle.
// NOTE(review): `mip` is used as an array index without a range check.
745 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
747 DPSOFTRAST_Texture *texture;
748 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
749 return texture->mipmap[mip][3];
// Depth in texels of mip level `mip` of the texture, or 0 for an invalid handle.
// NOTE(review): `mip` is used as an array index without a range check.
751 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
753 DPSOFTRAST_Texture *texture;
754 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755 return texture->mipmap[mip][4];
// Pointer to the first byte of mip level `mip` inside the texture's pixel storage,
// or a null pointer for an invalid handle.
// NOTE(review): `mip` is used as an array index without a range check.
757 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
759 DPSOFTRAST_Texture *texture;
760 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of the level
763 return texture->bytes + texture->mipmap[mip][0];
// Set the filter mode of a texture; invalid handles are ignored.
// Filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR are only accepted on textures created with
// DPSOFTRAST_TEXTURE_FLAG_MIPMAP; otherwise an error message is stored in dpsoftrast.errorstring.
765 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
767 DPSOFTRAST_Texture *texture;
768 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
771 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
776 texture->filter = filter;
// forward declaration; used by DPSOFTRAST_AllocateCommand below
779 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publish the commands written so far: drawcommand is advanced to the current write position.
// With threads enabled, a store fence is issued first so the command bytes are written before
// the new drawcommand value.
781 static void DPSOFTRAST_Draw_SyncCommands(void)
783 if(dpsoftrast.usethreads) MEMORY_BARRIER;
784 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Try to make at least `space` bytes available in the command pool.
// Recomputes usedcommands as the largest backlog of any thread and, while that is still too much,
// waits on a lagging thread until it has caught up with drawcommand.
787 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
789 DPSOFTRAST_State_Thread *thread;
791 int freecommand = dpsoftrast.commandpool.freecommand;
792 int usedcommands = dpsoftrast.commandpool.usedcommands;
// already enough room according to the cached counter
793 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// make everything written so far visible to the threads before measuring their progress
795 DPSOFTRAST_Draw_SyncCommands();
801 for (i = 0; i < dpsoftrast.numthreads; i++)
803 thread = &dpsoftrast.threads[i];
// bytes between this thread's read position and the write position, modulo the pool size
804 commandoffset = freecommand - thread->commandoffset;
805 if (commandoffset < 0)
806 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest backlog
807 if (commandoffset > usedcommands)
810 usedcommands = commandoffset;
// enough space now, or no thread left to wait for
813 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// block on the selected thread until it signals waitcond
815 thread = &dpsoftrast.threads[waitindex];
816 Thread_LockMutex(thread->drawmutex);
// only wait if the thread has not already consumed everything that was published
817 if (thread->commandoffset != dpsoftrast.drawcommand)
819 thread->waiting = true;
// wake the thread first if it is idle waiting for work
820 if (thread->starving) Thread_CondSignal(thread->drawcond);
821 Thread_CondWait(thread->waitcond, thread->drawmutex);
822 thread->waiting = false;
824 Thread_UnlockMutex(thread->drawmutex);
826 dpsoftrast.commandpool.usedcommands = usedcommands;
// Round `size` up to the next multiple of COMMAND_SIZE (COMMAND_SIZE must be a power of two
// for the mask arithmetic to work).
829 #define DPSOFTRAST_ALIGNCOMMAND(size) \
830 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocate a command of type DPSOFTRAST_Command_<name> from the pool with opcode
// DPSOFTRAST_OPCODE_<name>, and yield it as a typed pointer.
831 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
832 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserve `size` bytes in the command pool and initialize the command header.
//   opcode - DPSOFTRAST_OPCODE_* value stored in the header
//   size   - command size in bytes (callers pass a DPSOFTRAST_ALIGNCOMMAND-rounded size)
// When the tail of the pool is too small for the command, a Reset command is written there and
// the unused tail is counted as used.
834 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
836 DPSOFTRAST_Command *command;
837 int freecommand = dpsoftrast.commandpool.freecommand;
838 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one more header (for a Reset command)
839 int extra = sizeof(DPSOFTRAST_Command);
// the command would run past the end: the wasted tail must be available as well
840 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim it (threaded) or flush the queued work
842 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
844 if (dpsoftrast.usethreads)
845 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
847 DPSOFTRAST_Draw_FlushThreads();
// the calls above may have changed the pool counters
848 freecommand = dpsoftrast.commandpool.freecommand;
849 usedcommands = dpsoftrast.commandpool.usedcommands;
851 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
// mark the tail with a Reset command and account for the skipped bytes
853 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
854 command->opcode = DPSOFTRAST_OPCODE_Reset;
855 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// place the new command at the write position
858 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
859 command->opcode = opcode;
860 command->commandsize = size;
862 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
864 dpsoftrast.commandpool.freecommand = freecommand;
865 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Take back the most recent allocation of `size` bytes from the command pool by moving the write
// position and the used counter back.
869 static void DPSOFTRAST_UndoCommand(int size)
871 int freecommand = dpsoftrast.commandpool.freecommand;
872 int usedcommands = dpsoftrast.commandpool.usedcommands;
// NOTE(review): presumably a wrap-around correction applied only when the write position went negative - confirm
875 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
876 usedcommands -= size;
877 dpsoftrast.commandpool.freecommand = freecommand;
878 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: set the viewport rectangle.
881 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread-side handler: stores the rectangle in thread->viewport and marks the framebuffer-derived
// values dirty so they are recomputed by the next DPSOFTRAST_Validate.
882 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
884 thread->viewport[0] = command->x;
885 thread->viewport[1] = command->y;
886 thread->viewport[2] = command->width;
887 thread->viewport[3] = command->height;
888 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queue a Viewport command and update the API-side copy of the viewport
// together with its derived transform.
890 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
892 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
895 command->width = width;
896 command->height = height;
// keep the global state in step with what the threads will see
898 dpsoftrast.viewport[0] = x;
899 dpsoftrast.viewport[1] = y;
900 dpsoftrast.viewport[2] = width;
901 dpsoftrast.viewport[3] = height;
902 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to (r, g, b, a).
905 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread-side handler: fills this thread's rows of every attached color buffer, limited to the
// scissor rectangle, with the packed BGRA8 clear color.
906 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
908 int i, x1, y1, x2, y2, w, h, x, y;
909 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the thread's row ranges are current
913 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
914 miny1 = thread->miny1;
915 maxy1 = thread->maxy1;
916 miny2 = thread->miny2;
917 maxy2 = thread->maxy2;
918 x1 = thread->fb_scissor[0];
919 y1 = thread->fb_scissor[1];
920 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
921 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the vertical range to rows owned by this thread
922 if (y1 < miny1) y1 = miny1;
923 if (y2 > maxy2) y2 = maxy2;
928 // FIXME: honor fb_colormask?
929 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// up to four color buffers; unset ones are skipped
930 for (i = 0;i < 4;i++)
932 if (!dpsoftrast.fb_colorpixels[i])
// first the rows up to maxy1, then continue from miny2 up to maxy2
934 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y; rows are fb_width pixels apart
937 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
938 for (x = x1;x < x2;x++)
// Public entry point: allocate a ClearColor command in the command pool.
943 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
945 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to `depth`.
952 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread-side handler: fills this thread's rows of the depth buffer, limited to the scissor
// rectangle, with the integer-encoded depth value.
953 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
955 int x1, y1, x2, y2, w, h, x, y;
956 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the thread's row ranges are current
960 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
961 miny1 = thread->miny1;
962 maxy1 = thread->maxy1;
963 miny2 = thread->miny2;
964 maxy2 = thread->maxy2;
965 x1 = thread->fb_scissor[0];
966 y1 = thread->fb_scissor[1];
967 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
968 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the vertical range to rows owned by this thread
969 if (y1 < miny1) y1 = miny1;
970 if (y2 > maxy2) y2 = maxy2;
975 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first the rows up to maxy1, then continue from miny2 up to maxy2
976 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y; rows are fb_width pixels apart
979 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
980 for (x = x1;x < x2;x++)
// Public entry point: allocate a ClearDepth command in the command pool.
984 void DPSOFTRAST_ClearDepth(float d)
986 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: enable/disable writes per color channel.
990 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread-side handler: stores each channel flag as 0/1 and builds fb_colormask, a BGRA8 bit mask
// with 0xFF in the byte of every enabled channel (a<<24, r<<16, g<<8, b).
991 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
993 thread->colormask[0] = command->r != 0;
994 thread->colormask[1] = command->g != 0;
995 thread->colormask[2] = command->b != 0;
996 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all-ones, which then selects the channel's byte
997 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public entry point: allocate a ColorMask command in the command pool.
999 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1001 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: enable/disable depth testing.
1008 DEFCOMMAND(5, DepthTest, int enable;)
// Thread-side handler: stores the flag and marks the derived depth function dirty
// (fb_depthfunc depends on depthtest).
1009 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1011 thread->depthtest = command->enable;
1012 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Public entry point: queue a DepthTest command carrying `enable`.
1014 void DPSOFTRAST_DepthTest(int enable)
1016 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1017 command->enable = enable;
// Command 6: enable/disable scissor testing.
1020 DEFCOMMAND(6, ScissorTest, int enable;)
// Thread-side handler: stores the flag and marks the framebuffer-derived values dirty
// (fb_scissor depends on scissortest).
1021 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1023 thread->scissortest = command->enable;
1024 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queue a ScissorTest command carrying `enable`.
1026 void DPSOFTRAST_ScissorTest(int enable)
1028 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1029 command->enable = enable;
// Command 7: set the scissor rectangle (carried as floats).
1032 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Thread-side handler: stores the rectangle in thread->scissor and marks the framebuffer-derived
// values dirty so fb_scissor is recomputed.
1033 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1035 thread->scissor[0] = command->x;
1036 thread->scissor[1] = command->y;
1037 thread->scissor[2] = command->width;
1038 thread->scissor[3] = command->height;
1039 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queue a Scissor command describing the rectangle.
1041 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1043 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1046 command->width = width;
1047 command->height = height;
// Command 8: set the source and destination blend factors.
1050 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Thread-side handler: stores both factors and marks the derived blend mode dirty.
1051 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1053 thread->blendfunc[0] = command->sfactor;
1054 thread->blendfunc[1] = command->dfactor;
1055 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: queue a BlendFunc command carrying both factors.
1057 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1059 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1060 command->sfactor = sfactor;
1061 command->dfactor = dfactor;
// Command 9: enable/disable subtractive blending.
1064 DEFCOMMAND(9, BlendSubtract, int enable;)
// Thread-side handler: stores the flag and marks the derived blend mode dirty.
1065 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1067 thread->blendsubtract = command->enable;
1068 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: queue a BlendSubtract command carrying `enable`.
1070 void DPSOFTRAST_BlendSubtract(int enable)
1072 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1073 command->enable = enable;
// Command 10: enable/disable depth writes.
1076 DEFCOMMAND(10, DepthMask, int enable;)
// Thread-side handler: stores the flag; no derived state is invalidated.
1077 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1079 thread->depthmask = command->enable;
// Public entry point: queue a DepthMask command carrying `enable`.
1081 void DPSOFTRAST_DepthMask(int enable)
1083 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1084 command->enable = enable;
1087 DEFCOMMAND(11, DepthFunc, int func;)
1088 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1090 thread->depthfunc = command->func;
1092 void DPSOFTRAST_DepthFunc(int func)
1094 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1095 command->func = func;
1098 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1099 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1101 thread->depthrange[0] = command->nearval;
1102 thread->depthrange[1] = command->farval;
1104 void DPSOFTRAST_DepthRange(float nearval, float farval)
1106 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1107 command->nearval = nearval;
1108 command->farval = farval;
1111 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1112 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1114 thread->polygonoffset[0] = command->alongnormal;
1115 thread->polygonoffset[1] = command->intoview;
1117 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1119 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1120 command->alongnormal = alongnormal;
1121 command->intoview = intoview;
1124 DEFCOMMAND(14, CullFace, int mode;)
1125 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1127 thread->cullface = command->mode;
1129 void DPSOFTRAST_CullFace(int mode)
1131 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1132 command->mode = mode;
1135 DEFCOMMAND(15, AlphaTest, int enable;)
1136 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1138 thread->alphatest = command->enable;
1140 void DPSOFTRAST_AlphaTest(int enable)
1142 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1143 command->enable = enable;
1146 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1147 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1149 thread->alphafunc = command->func;
1150 thread->alphavalue = command->ref;
1152 void DPSOFTRAST_AlphaFunc(int func, float ref)
1154 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1155 command->func = func;
1159 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1161 dpsoftrast.color[0] = r;
1162 dpsoftrast.color[1] = g;
1163 dpsoftrast.color[2] = b;
1164 dpsoftrast.color[3] = a;
// Copies a rectangle of color buffer 0 (dpsoftrast.fb_colorpixels[0]) into
// outpixels, 4 bytes per pixel.
//   blockx, blocky, blockwidth, blockheight - requested rectangle
//   outpixels - destination; its rows are blockwidth*4 bytes apart
// The rectangle is clipped to the framebuffer, and source rows are addressed as
// (fb_height - 1 - y), so the framebuffer is read vertically flipped.
1167 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1169 int outstride = blockwidth * 4;
1170 int instride = dpsoftrast.fb_width * 4;
1173 int bx2 = blockx + blockwidth;
1174 int by2 = blocky + blockheight;
1178 unsigned char *inpixels;
// clip the requested block to the framebuffer bounds
1182 if (bx1 < 0) bx1 = 0;
1183 if (by1 < 0) by1 = 0;
1184 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1185 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1187 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk the pixels of each row one at a time
1188 if (dpsoftrast.bigendian)
1190 for (y = by1;y < by2;y++)
// b: start of the flipped source row at column bx1; o: start of the output row
1192 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1193 o = (unsigned char *)outpixels + (y - by1) * outstride;
1194 for (x = bx1;x < bx2;x++)
// second row loop with the same row addressing
// NOTE(review): presumably the little-endian path - confirm
1207 for (y = by1;y < by2;y++)
1209 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1210 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of color buffer 0 into one mip level of a texture.
//   index         - texture handle, resolved with DPSOFTRAST_Texture_GetByIndex
//   mip           - mip level; must be in [0, texture->mipmaps)
//   tx, ty        - destination origin in the texture level
//   sx, sy        - source origin in the framebuffer
//   width, height - size of the rectangle
// Both rectangles are clipped to their surfaces and the copy size is limited
// to the smaller of the two. Source rows are addressed from the bottom of the
// framebuffer. If the texture has more than one mip level,
// DPSOFTRAST_Texture_CalculateMipmaps is called afterwards.
1216 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1220 int tx2 = tx + width;
1221 int ty2 = ty + height;
1224 int sx2 = sx + width;
1225 int sy2 = sy + height;
1235 unsigned int *spixels;
1236 unsigned int *tpixels;
1237 DPSOFTRAST_Texture *texture;
// unknown texture handles and out-of-range mip levels are ignored
1238 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1239 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: color buffer 0
1241 spixels = dpsoftrast.fb_colorpixels[0];
1242 swidth = dpsoftrast.fb_width;
1243 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is the byte offset of the level inside
// texture->bytes, [2] and [3] are used as its width and height
1244 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1245 twidth = texture->mipmap[mip][2];
1246 theight = texture->mipmap[mip][3];
// clip the destination rectangle to the texture level
1247 if (tx1 < 0) tx1 = 0;
1248 if (ty1 < 0) ty1 = 0;
1249 if (tx2 > twidth) tx2 = twidth;
1250 if (ty2 > theight) ty2 = theight;
// clip the source rectangle to the framebuffer
1251 if (sx1 < 0) sx1 = 0;
1252 if (sy1 < 0) sy1 = 0;
1253 if (sx2 > swidth) sx2 = swidth;
1254 if (sy2 > sheight) sy2 = sheight;
// copy no more than the source rectangle provides
1259 if (tw > sw) tw = sw;
1260 if (th > sh) th = sh;
// check for an empty rectangle after clipping
1261 if (tw < 1 || th < 1)
// convert sy1 to a row index counted from the bottom; the copy then walks
// source rows downwards (sy1 - y) while destination rows go upwards (ty1 + y)
1263 sy1 = sheight - 1 - sy1;
1264 for (y = 0;y < th;y++)
1265 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1266 if (texture->mipmaps > 1)
1267 DPSOFTRAST_Texture_CalculateMipmaps(index);
1270 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1271 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1273 if (thread->texbound[command->unitnum])
1274 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1275 thread->texbound[command->unitnum] = command->texture;
1277 void DPSOFTRAST_SetTexture(int unitnum, int index)
1279 DPSOFTRAST_Command_SetTexture *command;
1280 DPSOFTRAST_Texture *texture;
1281 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1283 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1286 texture = DPSOFTRAST_Texture_GetByIndex(index);
1287 if (index && !texture)
1289 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1293 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1294 command->unitnum = unitnum;
1295 command->texture = texture;
1297 dpsoftrast.texbound[unitnum] = texture;
1298 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1301 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1303 dpsoftrast.pointer_vertex3f = vertex3f;
1304 dpsoftrast.stride_vertex = stride;
1306 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1308 dpsoftrast.pointer_color4f = color4f;
1309 dpsoftrast.pointer_color4ub = NULL;
1310 dpsoftrast.stride_color = stride;
1312 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1314 dpsoftrast.pointer_color4f = NULL;
1315 dpsoftrast.pointer_color4ub = color4ub;
1316 dpsoftrast.stride_color = stride;
1318 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1320 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1321 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1322 dpsoftrast.stride_texcoord[unitnum] = stride;
1325 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1326 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1328 thread->shader_mode = command->mode;
1329 thread->shader_permutation = command->permutation;
1330 thread->shader_exactspecularmath = command->exactspecularmath;
1332 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1334 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1335 command->mode = mode;
1336 command->permutation = permutation;
1337 command->exactspecularmath = exactspecularmath;
1339 dpsoftrast.shader_mode = mode;
1340 dpsoftrast.shader_permutation = permutation;
1341 dpsoftrast.shader_exactspecularmath = exactspecularmath;
1344 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1345 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1347 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1349 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1351 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1352 command->index = index;
1353 command->val[0] = v0;
1354 command->val[1] = v1;
1355 command->val[2] = v2;
1356 command->val[3] = v3;
1358 dpsoftrast.uniform4f[index*4+0] = v0;
1359 dpsoftrast.uniform4f[index*4+1] = v1;
1360 dpsoftrast.uniform4f[index*4+2] = v2;
1361 dpsoftrast.uniform4f[index*4+3] = v3;
1363 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1365 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1366 command->index = index;
1367 memcpy(command->val, v, sizeof(command->val));
1369 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1372 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1373 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1375 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
// For each matrix a UniformMatrix4f command is allocated and filled, and the
// same 16 floats are written into dpsoftrast.uniform4f. Each matrix advances
// the slot index by 4 and the source pointer v by 16 floats.
1377 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1381 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1383 __m128 m0, m1, m2, m3;
1384 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1385 command->index = (DPSOFTRAST_UNIFORM)index;
// a source pointer that is not a multiple of ALIGN_SIZE needs unaligned loads
1386 if (((size_t)v)&(ALIGN_SIZE-1))
1388 m0 = _mm_loadu_ps(v);
1389 m1 = _mm_loadu_ps(v+4);
1390 m2 = _mm_loadu_ps(v+8);
1391 m3 = _mm_loadu_ps(v+12);
// aligned loads of the same four groups of four floats
1395 m0 = _mm_load_ps(v);
1396 m1 = _mm_load_ps(v+4);
1397 m2 = _mm_load_ps(v+8);
1398 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave pairs of vectors, then recombine the halves so
// that the new m0..m3 hold element 0..3 of each of the old vectors
// NOTE(review): presumably applied only when `transpose` is nonzero - confirm
1402 __m128 t0, t1, t2, t3;
1403 t0 = _mm_unpacklo_ps(m0, m1);
1404 t1 = _mm_unpacklo_ps(m2, m3);
1405 t2 = _mm_unpackhi_ps(m0, m1);
1406 t3 = _mm_unpackhi_ps(m2, m3);
1407 m0 = _mm_movelh_ps(t0, t1);
1408 m1 = _mm_movehl_ps(t1, t0);
1409 m2 = _mm_movelh_ps(t2, t3);
1410 m3 = _mm_movehl_ps(t3, t2);
// write the matrix to the command record and to dpsoftrast.uniform4f; both
// use aligned stores
1412 _mm_store_ps(command->val, m0);
1413 _mm_store_ps(command->val+4, m1);
1414 _mm_store_ps(command->val+8, m2);
1415 _mm_store_ps(command->val+12, m3);
1416 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1417 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1418 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1419 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1424 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1425 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1427 thread->uniform1i[command->index] = command->val;
1429 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1431 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1432 command->index = index;
1435 dpsoftrast.uniform1i[command->index] = i0;
// Copies 4-float vectors from a byte-addressed source into dst.
//   dst    - output, written with aligned 16-byte stores
//   src    - first input vector
//   size   - number of vectors (end is dst + size*4 floats)
//   stride - NOTE(review): presumably the byte distance between input
//            vectors; the pointer advance is not shown here - confirm
1439 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1441 float *end = dst + size*4;
// if either the start address or the stride breaks ALIGN_SIZE alignment, the
// unaligned load form is used; otherwise the aligned load
1442 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1446 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1455 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float vectors (x, y, z) into 4-float vectors (x, y, z, 1.0f).
//   dst    - output, written with aligned 16-byte stores
//   src    - first input vector
//   size   - number of vectors (end is dst + size*4 floats)
//   stride - compared with sizeof(float[3]) to detect a tightly packed array
// The w = 1 insertion uses a rotate / _mm_move_ss / rotate-back sequence:
// (x,y,z,?) -> (?,x,y,z) -> (1,x,y,z) -> (x,y,z,1).
1462 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1464 float *end = dst + size*4;
// tightly packed input: four vectors (12 floats) are taken from three 16-byte
// loads per step; end4 covers the part of the output that is a multiple of 4
1465 if (stride == sizeof(float[3]))
1467 float *end4 = dst + (size&~3)*4;
1468 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1472 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// vector 0 comes from v1 alone
1473 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1474 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vector 1 straddles v1 and v2
1476 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1477 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vector 2 straddles v2 and v3
1479 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1480 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vector 3 is the upper three lanes of v3
1483 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1484 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1486 src += 4*sizeof(float[3]);
// same four-vector unpack using aligned loads
1493 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1494 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1495 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1496 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1497 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1498 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1501 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1502 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1505 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1507 src += 4*sizeof(float[3]);
// one vector at a time; address and stride decide between the two load forms
// NOTE(review): each step loads 16 bytes for a 12-byte vector, so the final
// vector reads 4 bytes past its own z component - confirm the buffers allow it
1511 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1515 __m128 v = _mm_loadu_ps((const float *)src);
1516 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1517 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1518 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1519 _mm_store_ps(dst, v);
1528 __m128 v = _mm_load_ps((const float *)src);
1529 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1530 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1531 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1532 _mm_store_ps(dst, v);
// Expands 2-float vectors (s, t) into 4-float vectors (s, t, 0.0f, 1.0f).
//   dst    - output, written with aligned 16-byte stores
//   src    - first input vector
//   size   - number of vectors (end is dst + size*4 floats)
//   stride - compared with sizeof(float[2]) to detect a tightly packed array
1539 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1541 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output vector
1542 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// tightly packed input: one 16-byte load yields two vectors; end2 covers the
// even part of the output
1543 if (stride == sizeof(float[2]))
1545 float *end2 = dst + (size&~1)*4;
1546 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1550 __m128 v = _mm_loadu_ps((const float *)src);
// the low pair of v makes the first vector, the high pair the second
1551 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1552 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1554 src += 2*sizeof(float[2]);
// same two-vector unpack using an aligned load
1561 __m128 v = _mm_load_ps((const float *)src);
1562 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1563 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1565 src += 2*sizeof(float[2]);
// single vector: load 8 bytes into the low half of v2
1571 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte vectors into 4-float vectors, scaling each byte by 1/255.
//   dst    - output, written with aligned 16-byte stores
//   src    - first input vector
//   size   - number of vectors (end is dst + size*4 floats)
//   stride - compared with sizeof(unsigned char[4]) to detect a tightly
//            packed array
1577 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1579 float *end = dst + size*4;
1580 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// tightly packed input: one 16-byte load yields four vectors; end4 covers the
// part of the output that is a multiple of 4
1581 if (stride == sizeof(unsigned char[4]))
1583 float *end4 = dst + (size&~3)*4;
1584 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes to 16 bits (v1 = low 8 bytes, v2 = high 8 bytes), then each half
// to 32 bits, convert to float and scale
1588 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1589 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1590 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1591 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1592 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1594 src += 4*sizeof(unsigned char[4]);
// same four-vector conversion using an aligned load
1601 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1602 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1603 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1604 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1605 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1607 src += 4*sizeof(unsigned char[4]);
// single vector: the four bytes are read through an int pointer
// NOTE(review): this assumes src is suitably aligned for an int read - confirm
1613 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1614 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the 4-float vector at src into dst with aligned 16-byte stores.
//   dst  - output; end is dst + 4*size floats
//   src  - the vector to replicate; read once with an unaligned load
//   size - number of output vectors
// NOTE(review): presumably the store repeats until dst reaches end - confirm
1620 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1622 float *end = dst + 4*size;
1623 __m128 v = _mm_loadu_ps(src);
1626 _mm_store_ps(dst, v);
// Transforms an array of 4-float vectors by a 4x4 matrix.
//   out4f       - output vectors
//   in4f        - input vectors; may be the same pointer as out4f
//   numitems    - number of vectors
//   inmatrix16f - 16 floats, used as four consecutive 4-float groups m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
1632 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1635 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1636 __m128 m0, m1, m2, m3;
// byte-wise comparison against the identity matrix
1638 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1640 // fast case for identity matrix
1641 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1644 end = out4f + numitems*4;
// the matrix is always read with unaligned loads
1645 m0 = _mm_loadu_ps(inmatrix16f);
1646 m1 = _mm_loadu_ps(inmatrix16f + 4);
1647 m2 = _mm_loadu_ps(inmatrix16f + 8);
1648 m3 = _mm_loadu_ps(inmatrix16f + 12);
1649 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1653 __m128 v = _mm_loadu_ps(in4f);
// broadcast each component of v and accumulate its product with m0..m3
1655 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1656 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1657 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1658 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// same computation with an aligned input load
1667 __m128 v = _mm_load_ps(in4f);
1669 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1670 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1671 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1672 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f with a single memcpy.
// The two ranges must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytecount = numitems * sizeof(float[4]);

	memcpy(out4f, in4f, bytecount);
}
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale):
// perspective-divides and viewport-maps one vector. `in` (x, y, z, w) is
// rotated to (1, x, y, z), multiplied by viewportscale, divided by w, offset
// by viewportcenter and rotated back, so
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN / sN are lanes of viewportcenter / viewportscale. Lane 0 of those
// two therefore applies to the w slot of the result.
1686 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1688 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1689 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1690 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1691 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale):
// the statements below are the same as those of DPSOFTRAST_PROJECTVERTEX:
// rotate `in` to (1, x, y, z), scale, divide by w, add the center, rotate back.
// NOTE(review): despite the name, all four lanes are computed - confirm that
// callers only rely on the y result.
1694 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1696 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1697 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1698 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1699 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3):
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, each component of p being broadcast
// across all lanes before the multiply.
// NOTE(review): p is presumably a local copy of `in` - confirm.
1702 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1705 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1707 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1708 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the range of screen rows covered by an axis-aligned bounding box
// after transformation by inmatrix16f.
//   starty, endy     - receive the row range
//   minposf, maxposf - box corners; read with aligned loads
//   inmatrix16f      - 16 floats, read with unaligned loads
// clipmask starts with one bit set per box corner (0xFF). A corner whose clip
// distance (z + w after the transform) is >= 0 has its bit cleared and
// contributes its projected y/w to minproj / maxproj. For a corner whose bit
// is still set, each of the three neighbours along a box edge (k^1, k^2, k^4)
// whose bit is clear contributes the projected point where that edge crosses
// the clip plane (frac = d[k] / (d[k] - d[neighbour])).
1711 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1713 int clipmask = 0xFF;
1714 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// minproj / maxproj start outside the [-2, 2] clamp range applied below
1715 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1716 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1717 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix group so the transformed y lands in
// lane 0, where the scalar (_ss) operations below work
1718 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1719 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1720 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1721 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k into bb[k], compute its clip distance,
// and if it is not clipped clear its clipmask bit and fold y/w into the range
1722 #define BBFRONT(k, pos) \
1724 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1725 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1726 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1729 clipmask &= ~(1<<k); \
1730 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1731 minproj = _mm_min_ss(minproj, proj); \
1732 maxproj = _mm_max_ss(maxproj, proj); \
1736 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1737 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1738 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1739 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1740 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1741 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1745 if (clipmask&(1<<k)) \
1747 if (!(clipmask&(1<<(k^1)))) \
1749 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1750 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1751 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1752 minproj = _mm_min_ss(minproj, proj); \
1753 maxproj = _mm_max_ss(maxproj, proj); \
1755 if (!(clipmask&(1<<(k^2)))) \
1757 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1758 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1759 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1760 minproj = _mm_min_ss(minproj, proj); \
1761 maxproj = _mm_max_ss(maxproj, proj); \
1763 if (!(clipmask&(1<<(k^4)))) \
1765 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1766 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1767 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
1773 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport center / scale into lane 0; in the layout used
// by DPSOFTRAST_PROJECTVERTEX that lane applies to y
1774 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1775 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to screen rows
1776 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1777 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1778 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1779 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// the start row is taken from maxproj and the end row from minproj (+1)
// NOTE(review): presumably because the y viewport scale is negative - confirm
1780 *starty = _mm_cvttss_si32(maxproj);
1781 *endy = _mm_cvttss_si32(minproj)+1;
// Projects already-transformed vertices to screen space and computes their
// row range.
//   out4f        - receives a copy of each input vector (aligned stores)
//   screen4f     - receives each vector after DPSOFTRAST_PROJECTVERTEX
//   starty, endy - forwarded to DPSOFTRAST_Vertex_BoundY
//   in4f         - input vectors; may be unaligned
//   numitems     - number of vectors (end is out4f + numitems*4 floats)
// The component-wise minimum and maximum of the inputs form a bounding box,
// which is passed to DPSOFTRAST_Vertex_BoundY with an identity matrix; its
// result is returned.
1785 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1787 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1788 float *end = out4f + numitems*4;
1789 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1790 __m128 minpos, maxpos;
1791 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vector
1793 minpos = maxpos = _mm_loadu_ps(in4f);
1796 __m128 v = _mm_loadu_ps(in4f);
1797 minpos = _mm_min_ps(minpos, v);
1798 maxpos = _mm_max_ps(maxpos, v);
1799 _mm_store_ps(out4f, v);
1800 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801 _mm_store_ps(screen4f, v);
// aligned input: same steps with aligned loads
1809 minpos = maxpos = _mm_load_ps(in4f);
1812 __m128 v = _mm_load_ps(in4f);
1813 minpos = _mm_min_ps(minpos, v);
1814 maxpos = _mm_max_ps(maxpos, v);
1815 _mm_store_ps(out4f, v);
1816 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1817 _mm_store_ps(screen4f, v);
// spill the bounds to aligned arrays for DPSOFTRAST_Vertex_BoundY
1825 ALIGN(float minposf[4]);
1826 ALIGN(float maxposf[4]);
1827 _mm_store_ps(minposf, minpos);
1828 _mm_store_ps(maxposf, maxpos);
1829 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms vertices by inmatrix16f, projects them to screen space and
// computes their row range.
//   out4f        - receives each transformed vector (aligned stores)
//   screen4f     - receives each transformed vector after
//                  DPSOFTRAST_PROJECTVERTEX
//   starty, endy - forwarded to DPSOFTRAST_Vertex_BoundY
//   in4f         - input vectors; may be unaligned
//   numitems     - number of vectors (end is out4f + numitems*4 floats)
//   inmatrix16f  - 16 floats, read with unaligned loads
// The bounding box is taken from the untransformed inputs and handed to
// DPSOFTRAST_Vertex_BoundY together with the matrix; its result is returned.
1834 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1836 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1837 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// a matrix that is byte-identical to identity takes the projection-only path
1839 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1840 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1841 end = out4f + numitems*4;
1842 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1843 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1844 m0 = _mm_loadu_ps(inmatrix16f);
1845 m1 = _mm_loadu_ps(inmatrix16f + 4);
1846 m2 = _mm_loadu_ps(inmatrix16f + 8);
1847 m3 = _mm_loadu_ps(inmatrix16f + 12);
1848 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vector
1850 minpos = maxpos = _mm_loadu_ps(in4f);
1853 __m128 v = _mm_loadu_ps(in4f);
// bounds are accumulated before the transform is applied
1854 minpos = _mm_min_ps(minpos, v);
1855 maxpos = _mm_max_ps(maxpos, v);
1856 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1857 _mm_store_ps(out4f, v);
1858 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1859 _mm_store_ps(screen4f, v);
// aligned input: same steps with aligned loads
1867 minpos = maxpos = _mm_load_ps(in4f);
1870 __m128 v = _mm_load_ps(in4f);
1871 minpos = _mm_min_ps(minpos, v);
1872 maxpos = _mm_max_ps(maxpos, v);
1873 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1874 _mm_store_ps(out4f, v);
1875 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1876 _mm_store_ps(screen4f, v);
// spill the bounds to aligned arrays for DPSOFTRAST_Vertex_BoundY
1884 ALIGN(float minposf[4]);
1885 ALIGN(float maxposf[4]);
1886 _mm_store_ps(minposf, minpos);
1887 _mm_store_ps(maxposf, maxpos);
1888 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one vertex attribute array into dpsoftrast.post_array4f[outarray] as
// 4-float vectors, for numvertices vertices starting at firstvertex.
//   outarray - index of the destination array
//   inarray  - source attribute: position, color, or a texture coordinate set
//              (addressed as inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
1894 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1897 float *outf = dpsoftrast.post_array4f[outarray];
1898 const unsigned char *inb;
1899 int firstvertex = dpsoftrast.firstvertex;
1900 int numvertices = dpsoftrast.numvertices;
// positions are 3-float vectors
1904 case DPSOFTRAST_ARRAY_POSITION:
1905 stride = dpsoftrast.stride_vertex;
1906 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1907 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors come from the float array if set, else from the byte array if set,
// else from the constant dpsoftrast.color
1909 case DPSOFTRAST_ARRAY_COLOR:
1910 stride = dpsoftrast.stride_color;
1911 if (dpsoftrast.pointer_color4f)
1913 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1914 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1916 else if (dpsoftrast.pointer_color4ub)
1918 stride = dpsoftrast.stride_color;
1919 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1920 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1924 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinates: the loader is chosen by the component count recorded
// for this texture coordinate set
1928 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1929 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1931 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1932 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1935 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1938 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1941 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1953 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1955 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1956 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1961 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1964 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1965 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
1973 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1976 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1977 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for one span.
//   thread   - rasterizer thread state
//   triangle - supplies the w plane: w[0] is the slope along x, w[1] the slope
//              along y, w[2] the constant term
//   span     - supplies the span origin (x, y) and the pixel range
//              [startx, endx) relative to it
//   zf       - NOTE(review): presumably receives one value per pixel; the
//              writes are not shown here - confirm
1984 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1987 int startx = span->startx;
1988 int endx = span->endx;
1989 float wslope = triangle->w[0];
// w evaluated at the span origin; pixel x adds wslope * x
1990 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact reciprocal at the first pixel
1991 float endz = 1.0f / (w + wslope * startx);
// no change of w along x: every pixel of the span shares one value
1992 if (triangle->w[0] == 0)
1994 // LordHavoc: fast flat polygons (HUD/menu)
1995 for (x = startx;x < endx;x++)
// otherwise walk the span in pieces of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: the reciprocal is computed exactly at the piece boundary and
// stepped linearly (dz) in between
1999 for (x = startx;x < endx;)
2001 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last piece ends on the final pixel of the span
2003 if (nextsub >= endx) nextsub = endsub = endx-1;
2004 endz = 1.0f / (w + wslope * nextsub);
// a one-pixel piece has no interval to divide by
2005 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2006 for (; x <= endsub; x++, z += dz)
2011 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2014 int startx = span->startx;
2015 int endx = span->endx;
2018 unsigned char * RESTRICT pixelmask = span->pixelmask;
2019 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2022 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2023 // handle alphatest now (this affects depth writes too)
2024 if (thread->alphatest)
2025 for (x = startx;x < endx;x++)
2026 if (in4f[x*4+3] < 0.5f)
2027 pixelmask[x] = false;
2028 // FIXME: this does not handle bigendian
2029 switch(thread->fb_blendmode)
2031 case DPSOFTRAST_BLENDMODE_OPAQUE:
2032 for (x = startx;x < endx;x++)
2036 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2037 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2038 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2039 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2040 pixel[x*4+0] = d[0];
2041 pixel[x*4+1] = d[1];
2042 pixel[x*4+2] = d[2];
2043 pixel[x*4+3] = d[3];
2046 case DPSOFTRAST_BLENDMODE_ALPHA:
2047 for (x = startx;x < endx;x++)
2051 a = in4f[x*4+3] * 255.0f;
2052 b = 1.0f - in4f[x*4+3];
2053 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2054 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2055 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2056 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2057 pixel[x*4+0] = d[0];
2058 pixel[x*4+1] = d[1];
2059 pixel[x*4+2] = d[2];
2060 pixel[x*4+3] = d[3];
2063 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2064 for (x = startx;x < endx;x++)
2068 a = in4f[x*4+3] * 255.0f;
2069 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2070 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2071 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2072 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2073 pixel[x*4+0] = d[0];
2074 pixel[x*4+1] = d[1];
2075 pixel[x*4+2] = d[2];
2076 pixel[x*4+3] = d[3];
2079 case DPSOFTRAST_BLENDMODE_ADD:
2080 for (x = startx;x < endx;x++)
2084 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2085 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2086 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2087 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2088 pixel[x*4+0] = d[0];
2089 pixel[x*4+1] = d[1];
2090 pixel[x*4+2] = d[2];
2091 pixel[x*4+3] = d[3];
2094 case DPSOFTRAST_BLENDMODE_INVMOD:
2095 for (x = startx;x < endx;x++)
2099 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2100 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2101 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2102 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2103 pixel[x*4+0] = d[0];
2104 pixel[x*4+1] = d[1];
2105 pixel[x*4+2] = d[2];
2106 pixel[x*4+3] = d[3];
2109 case DPSOFTRAST_BLENDMODE_MUL:
2110 for (x = startx;x < endx;x++)
2114 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2115 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2116 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2117 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2118 pixel[x*4+0] = d[0];
2119 pixel[x*4+1] = d[1];
2120 pixel[x*4+2] = d[2];
2121 pixel[x*4+3] = d[3];
2124 case DPSOFTRAST_BLENDMODE_MUL2:
2125 for (x = startx;x < endx;x++)
2129 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2130 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2131 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2132 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2133 pixel[x*4+0] = d[0];
2134 pixel[x*4+1] = d[1];
2135 pixel[x*4+2] = d[2];
2136 pixel[x*4+3] = d[3];
2139 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2140 for (x = startx;x < endx;x++)
2144 a = in4f[x*4+3] * -255.0f;
2145 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2146 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2147 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2148 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2149 pixel[x*4+0] = d[0];
2150 pixel[x*4+1] = d[1];
2151 pixel[x*4+2] = d[2];
2152 pixel[x*4+3] = d[3];
2155 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2156 for (x = startx;x < endx;x++)
2161 b = 1.0f - in4f[x*4+3];
2162 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2163 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2164 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2165 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2166 pixel[x*4+0] = d[0];
2167 pixel[x*4+1] = d[1];
2168 pixel[x*4+2] = d[2];
2169 pixel[x*4+3] = d[3];
2172 case DPSOFTRAST_BLENDMODE_INVADD:
2173 for (x = startx;x < endx;x++)
2177 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2178 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2179 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2180 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2181 pixel[x*4+0] = d[0];
2182 pixel[x*4+1] = d[1];
2183 pixel[x*4+2] = d[2];
2184 pixel[x*4+3] = d[3];
// Writes one span of packed 8-bit shader output (in4ub, 4 bytes per pixel)
// into the color buffer dpsoftrast.fb_colorpixels[0], applying the blend mode
// in thread->fb_blendmode.  Pixels whose span->pixelmask entry is zero are
// left untouched.  The mask itself is also modified here: by the alpha test
// and by the zero-alpha rejection used for the alpha-based blend modes.
// Runs of consecutive unmasked pixels ("subspans") are located first and then
// copied or blended as a unit.
2190 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2194 int startx = span->startx;
2195 int endx = span->endx;
// ini and pixeli view the input and the framebuffer as one 32-bit word per pixel
2197 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2198 unsigned char * RESTRICT pixelmask = span->pixelmask;
2199 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2200 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views to this span's origin (row span->y, column span->x)
2203 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2204 pixeli += span->y * dpsoftrast.fb_width + span->x;
2205 // handle alphatest now (this affects depth writes too)
// byte 3 of each input pixel is the alpha value; anything below 128 fails
2206 if (thread->alphatest)
2207 for (x = startx;x < endx;x++)
2208 if (in4ub[x*4+3] < 128)
2209 pixelmask[x] = false;
2210 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2211 // helps sprites, text and hud artwork
2212 switch(thread->fb_blendmode)
2214 case DPSOFTRAST_BLENDMODE_ALPHA:
2215 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2216 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// in these three modes the source is scaled by its own alpha, so a pixel
// with alpha byte 0 leaves the destination unchanged and can be masked off
2217 for (x = startx;x < endx;x++)
2218 if (in4ub[x*4+3] < 1)
2219 pixelmask[x] = false;
// no mask change is visible for the remaining blend modes
2221 case DPSOFTRAST_BLENDMODE_OPAQUE:
2222 case DPSOFTRAST_BLENDMODE_ADD:
2223 case DPSOFTRAST_BLENDMODE_INVMOD:
2224 case DPSOFTRAST_BLENDMODE_MUL:
2225 case DPSOFTRAST_BLENDMODE_MUL2:
2226 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2227 case DPSOFTRAST_BLENDMODE_INVADD:
2230 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): these two stores land at pixelmask[endx] and
// pixelmask[endx+1], so the mask buffer needs at least two bytes of slack
// past endx - confirm where the buffer is sized
2231 pixelmask[endx] = 1;
2232 pixelmask[endx+1] = 0;
2233 // LordHavoc: use a double loop to identify subspans, this helps the
2234 // optimized copy/blend loops to perform at their best, most triangles
2235 // have only one run of pixels, and do the search using wide reads...
2239 // if this pixel is masked off, it's probably not alone...
2246 // the 4-item search must be aligned or else it stalls badly
2247 if ((x & 3) && !pixelmask[x]) x++;
2248 if ((x & 3) && !pixelmask[x]) x++;
2249 if ((x & 3) && !pixelmask[x]) x++;
// NOTE(review): the cast binds tighter than '+', so this reads the 32-bit
// word at index x (byte offset 4*x), not the four mask bytes starting at
// pixelmask[x] - verify; *(unsigned int *)(pixelmask + x) may be intended
2250 while (*((unsigned int *)pixelmask + x) == 0x00000000)
// finish the search one byte at a time; the nonzero sentinel at
// pixelmask[endx] stops this loop
2254 for (;!pixelmask[x];x++)
2256 // rather than continue the loop, just check the end variable
2260 // find length of subspan
2265 if ((subx & 3) && pixelmask[subx]) subx++;
2266 if ((subx & 3) && pixelmask[subx]) subx++;
2267 if ((subx & 3) && pixelmask[subx]) subx++;
// NOTE(review): same word-index vs byte-offset concern as the search above
2268 while (*((unsigned int *)pixelmask + subx) == 0x01010101)
// the zero sentinel at pixelmask[endx+1] stops this loop
2272 for (;pixelmask[subx];subx++)
2274 // the checks can overshoot, so make sure to clip it...
2277 // now that we know the subspan length... process!
2278 switch(thread->fb_blendmode)
2280 case DPSOFTRAST_BLENDMODE_OPAQUE:
// OPAQUE: the subspan is copied verbatim.  Both a memcpy form and an
// unrolled SSE form (16 pixels, then 4 pixels, then single pixels) appear
// below; presumably only one of them is compiled in - confirm
2284 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2289 while (x + 16 <= subx)
2291 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2292 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2293 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2294 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2299 while (x + 4 <= subx)
2301 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2307 pixeli[x+1] = ini[x+1];
// ALPHA: dst += (src - dst) * srcalpha / 256, where srcalpha is 16-bit lane 3
// of each pixel broadcast to all four channels
2317 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1): walks the subspan two pixels at a time, widens
// the bytes of src (input) and dst (framebuffer) to 16-bit lanes, and stores
// the result back with unsigned saturation (_mm_packus_epi16); a final odd
// pixel is handled on its own.  blend2 and blend1 are presumably the
// statements used for the two-pixel and one-pixel paths respectively.
2318 #define FINISHBLEND(blend2, blend1) \
2319 for (;x + 1 < subx;x += 2) \
2322 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2323 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2325 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2330 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2331 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2333 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2337 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2338 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2340 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2341 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * srcalpha) >> 8
2344 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2346 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2347 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2349 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2350 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst += src; the saturating pack in FINISHBLEND clamps the sum to 255
2353 case DPSOFTRAST_BLENDMODE_ADD:
2354 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2356 case DPSOFTRAST_BLENDMODE_INVMOD:
2358 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2360 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2363 case DPSOFTRAST_BLENDMODE_MUL:
2364 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2366 case DPSOFTRAST_BLENDMODE_MUL2:
2367 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * srcalpha) >> 8; the saturating pack clamps
// negative results to 0
2369 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2371 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2372 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2374 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2375 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * srcalpha) >> 8)
2378 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2380 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2381 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2383 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2384 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src / 256
2387 case DPSOFTRAST_BLENDMODE_INVADD:
2389 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2391 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Fills out4f (4 floats per pixel, pixels startx..endx-1) with samples of the
// texture bound to unit texunitindex.  Texture coordinates come from the
// vertex attribute arrayindex as prepared by DPSOFTRAST_CALCATTRIB4F; at a
// pixel x they are (data + slope*x) * zf[x], scaled by the mip level's size.
// Texels are read as 4 bytes, with bytes 2,1,0,3 going to outputs 0,1,2,3,
// and are normalized to the 0..1 range.
// zf presumably holds a per-pixel perspective factor - confirm with the caller.
2399 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2402 int startx = span->startx;
2403 int endx = span->endx;
2408 float tc[2], endtc[2];
2410 unsigned int tci[2];
2411 unsigned int tci1[2];
2412 unsigned int tcimin[2];
2413 unsigned int tcimax[2];
2418 const unsigned char * RESTRICT pixelbase;
2419 const unsigned char * RESTRICT pixel[4];
2420 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2421 // if no texture is bound, just fill it with white
2424 for (x = startx;x < endx;x++)
2426 out4f[x*4+0] = 1.0f;
2427 out4f[x*4+1] = 1.0f;
2428 out4f[x*4+2] = 1.0f;
2429 out4f[x*4+3] = 1.0f;
// the mip level is chosen per triangle; mipmap[mip][0] is used as the byte
// offset of that level inside texture->bytes
2433 mip = triangle->mip[texunitindex];
2434 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2435 // if this mipmap of the texture is 1 pixel, just fill it with that color
// NOTE(review): the color is read from texture->bytes (start of the texture
// data) rather than from pixelbase, which includes this level's offset;
// DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8 reads its 1-pixel color from
// pixelbase - confirm which is intended
2436 if (texture->mipmap[mip][1] == 4)
2438 c[0] = texture->bytes[2] * (1.0f/255.0f);
2439 c[1] = texture->bytes[1] * (1.0f/255.0f);
2440 c[2] = texture->bytes[0] * (1.0f/255.0f);
2441 c[3] = texture->bytes[3] * (1.0f/255.0f);
2442 for (x = startx;x < endx;x++)
2444 out4f[x*4+0] = c[0];
2445 out4f[x*4+1] = c[1];
2446 out4f[x*4+2] = c[2];
2447 out4f[x*4+3] = c[3];
// nonzero when the texture asks for linear filtering
2451 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2452 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2453 flags = texture->flags;
// mipmap[mip][2] and mipmap[mip][3] serve as the level's width and height in
// texels: they scale the coordinates, and [2] is also the row length used
// when addressing texels
2454 tcscale[0] = texture->mipmap[mip][2];
2455 tcscale[1] = texture->mipmap[mip][3];
2456 tciwidth = texture->mipmap[mip][2];
2459 tcimax[0] = texture->mipmap[mip][2]-1;
2460 tcimax[1] = texture->mipmap[mip][3]-1;
// wrapping is done by AND-ing with size-1, which is a true modulo only for
// power-of-two sizes
2461 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2462 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texture coordinate at the first pixel of the span, in texel units
2463 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2464 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// The span is processed in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: the coordinate is evaluated with zf at pixel nextsub and stepped
// linearly for the pixels in between.
2470 for (x = startx;x < endx;)
2472 unsigned int subtc[2];
2473 unsigned int substep[2];
2474 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2475 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2476 if (nextsub >= endx)
2478 nextsub = endsub = endx-1;
2479 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2483 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2484 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// 12-bit fixed point: substep is the per-pixel increment (subscale is 4096
// divided by the pixel count) and subtc the starting coordinate.
// NOTE(review): both are unsigned int assigned from float expressions; when
// the float value is negative that conversion is undefined in C - confirm
// the inputs or convert through a signed int first
2490 substep[0] = (endtc[0] - tc[0]) * subscale;
2491 substep[1] = (endtc[1] - tc[1]) * subscale;
2492 subtc[0] = tc[0] * (1<<12);
2493 subtc[1] = tc[1] * (1<<12);
// bilinear sampling, corner coordinates clamped to [tcimin, tcimax]
2496 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2498 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// frac is the low 12 bits of each coordinate; lerp[] holds the weights of
// the four corner texels, which always sum to 0x1000000
2500 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2501 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2502 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2503 tci[0] = subtc[0]>>12;
2504 tci[1] = subtc[1]>>12;
2505 tci1[0] = tci[0] + 1;
2506 tci1[1] = tci[1] + 1;
2507 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2508 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2509 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2510 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// pixel[0..3] = texels at (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2511 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2512 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2513 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2514 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// weighted sum of the four texels; 255 * 0x1000000 == 0xFF000000, so the
// multiply scales the result to 0..1
2515 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2516 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2517 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2518 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2519 out4f[x*4+0] = c[0];
2520 out4f[x*4+1] = c[1];
2521 out4f[x*4+2] = c[2];
2522 out4f[x*4+3] = c[3];
// bilinear sampling, corner coordinates wrapped with tciwrapmask
2527 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2529 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2530 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2531 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2532 tci[0] = subtc[0]>>12;
2533 tci[1] = subtc[1]>>12;
2534 tci1[0] = tci[0] + 1;
2535 tci1[1] = tci[1] + 1;
2536 tci[0] &= tciwrapmask[0];
2537 tci[1] &= tciwrapmask[1];
2538 tci1[0] &= tciwrapmask[0];
2539 tci1[1] &= tciwrapmask[1];
2540 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2541 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2542 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2543 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2544 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2545 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2546 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2547 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2548 out4f[x*4+0] = c[0];
2549 out4f[x*4+1] = c[1];
2550 out4f[x*4+2] = c[2];
2551 out4f[x*4+3] = c[3];
// single-texel sampling, coordinate clamped to [tcimin, tcimax]
2555 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2557 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2559 tci[0] = subtc[0]>>12;
2560 tci[1] = subtc[1]>>12;
2561 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2562 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2563 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2564 c[0] = pixel[0][2] * (1.0f / 255.0f);
2565 c[1] = pixel[0][1] * (1.0f / 255.0f);
2566 c[2] = pixel[0][0] * (1.0f / 255.0f);
2567 c[3] = pixel[0][3] * (1.0f / 255.0f);
2568 out4f[x*4+0] = c[0];
2569 out4f[x*4+1] = c[1];
2570 out4f[x*4+2] = c[2];
2571 out4f[x*4+3] = c[3];
// single-texel sampling, coordinate wrapped with tciwrapmask
2576 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2578 tci[0] = subtc[0]>>12;
2579 tci[1] = subtc[1]>>12;
2580 tci[0] &= tciwrapmask[0];
2581 tci[1] &= tciwrapmask[1];
2582 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2583 c[0] = pixel[0][2] * (1.0f / 255.0f);
2584 c[1] = pixel[0][1] * (1.0f / 255.0f);
2585 c[2] = pixel[0][0] * (1.0f / 255.0f);
2586 c[3] = pixel[0][3] * (1.0f / 255.0f);
2587 out4f[x*4+0] = c[0];
2588 out4f[x*4+1] = c[1];
2589 out4f[x*4+2] = c[2];
2590 out4f[x*4+3] = c[3];
// SSE2 texture sampler producing packed 8-bit output: fills out4ub (viewed as
// one 32-bit word per pixel through outi) for pixels startx..endx-1 with
// texels of the texture bound to unit texunitindex, either copied as whole
// 32-bit words or bilinearly blended per byte channel.  Texture coordinates
// come from vertex attribute arrayindex (DPSOFTRAST_CALCATTRIB), multiplied
// by zf[x] and pre-scaled to texel units.
// zf presumably holds a per-pixel perspective factor - confirm with the caller.
2596 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2600 int startx = span->startx;
2601 int endx = span->endx;
2603 __m128 data, slope, tcscale;
2604 __m128i tcsize, tcmask, tcoffset, tcmax;
2606 __m128i subtc, substep, endsubtc;
2609 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2610 const unsigned char * RESTRICT pixelbase;
2611 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2612 // if no texture is bound, just fill it with white
2615 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
// the mip level is chosen per triangle; mipmap[mip][0] is used as the byte
// offset of that level inside texture->bytes
2618 mip = triangle->mip[texunitindex];
2619 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2620 // if this mipmap of the texture is 1 pixel, just fill it with that color
2621 if (texture->mipmap[mip][1] == 4)
// k is the level's only texel as a 32-bit word - presumably stored to every
// pixel of the span by the loop below
2623 unsigned int k = *((const unsigned int *)pixelbase);
2624 for (x = startx;x < endx;x++)
2628 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2629 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2630 flags = texture->flags;
// tcsize = (m2, m3, m2, m3) where m2/m3 are mipmap[mip][2] and [3], used as
// the level's width and height; tcmask = tcsize - 1
2631 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2632 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2633 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the two low texcoord components into both halves of the
// register and pre-scale them to texel units
2634 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2635 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texture coordinate at the first pixel of the span
2636 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift; NOTE(review): presumably applied only when filtering -
// confirm the guarding condition
2638 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// the same coordinate converted to 16.16 fixed point
2639 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// tcoffset holds the 16-bit pair (4, width*4) in every 32-bit lane, so
// _mm_madd_epi16 on an (x, y) pair yields the byte offset x*4 + y*width*4
2640 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// tcmask narrowed to 16-bit lanes, used as clamp limit and as wrap mask
2641 tcmax = _mm_packs_epi32(tcmask, tcmask);
// The span is processed in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: the coordinate is evaluated with zf at pixel nextsub and stepped
// linearly for the pixels in between.
2642 for (x = startx;x < endx;)
2644 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2645 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2646 if (nextsub >= endx)
2648 nextsub = endsub = endx-1;
2649 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2653 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2655 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2656 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2657 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// subtc now carries two pixels: pixel x in the low 64 bits and pixel x+1 in
// the high 64 bits; substep is doubled so one add advances both by two pixels
2658 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2659 substep = _mm_slli_epi32(substep, 1);
// Fast path test: the integer coordinates at the start of the subspan and at
// endsubtc + substep must all satisfy 0 <= t < tcmask, in which case the
// loads below are done without any clamping or wrapping.
2662 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2663 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// byte length of one texel row (width*4), taken from the high half of tcoffset
2665 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2666 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2668 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// tci: the integer (x, y) of both pixels as 16-bit pairs, i.e. the high
// word of each 16.16 coordinate
2669 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2670 tci = _mm_madd_epi16(tci, tcoffset);
2671 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2672 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 8-byte load fetches two horizontally adjacent texels, widened to
// 16-bit lanes; pix1/pix2 are rows y and y+1 for pixel x, pix3/pix4 for
// pixel x+1
2673 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2674 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2675 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2676 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fractions are halved to 15 bits so they stay positive for the signed
// _mm_mulhi_epi16; the other operand is doubled to compensate
2677 fracm = _mm_srli_epi16(subtc, 1);
// blend rows y and y+1 by the y fraction of each pixel
2678 pix1 = _mm_add_epi16(pix1,
2679 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2680 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2681 pix3 = _mm_add_epi16(pix3,
2682 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2683 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// regroup so pix2 holds the left column and pix4 the right column of both
// pixels, then blend them by the x fraction of each pixel
2684 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2685 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2686 pix2 = _mm_add_epi16(pix2,
2687 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2688 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
// narrow back to bytes with saturation and store the two output pixels
2689 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the same fast-path blend
2693 const unsigned char * RESTRICT ptr1;
2694 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2695 tci = _mm_madd_epi16(tci, tcoffset);
2696 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2697 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2698 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2699 fracm = _mm_srli_epi16(subtc, 1);
2700 pix1 = _mm_add_epi16(pix1,
2701 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2702 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2703 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2704 pix1 = _mm_add_epi16(pix1,
2705 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2706 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2707 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// General bilinear path, clamp-to-edge addressing: the constant added to tci
// turns one (x, y) pair into the four corners (x,y), (x+1,y), (x,y+1),
// (x+1,y+1); each corner is limited to [0, tcmax] and fetched as its own
// 32-bit texel.
2711 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2713 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2715 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2716 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2717 tci = _mm_madd_epi16(tci, tcoffset);
2718 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2719 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2720 _mm_setzero_si128());
2721 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2722 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2723 _mm_setzero_si128());
2724 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the second pixel of the pair is addressed with a wrapping
// AND here, while the first pixel above and the single-pixel tail of this
// clamp-to-edge branch use min/max clamping - this looks inconsistent;
// confirm whether it should clamp as well
2725 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2726 tci = _mm_madd_epi16(tci, tcoffset);
2727 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2728 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2729 _mm_setzero_si128());
2730 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2731 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2732 _mm_setzero_si128());
2733 fracm = _mm_srli_epi16(subtc, 1);
2734 pix1 = _mm_add_epi16(pix1,
2735 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2736 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2737 pix3 = _mm_add_epi16(pix3,
2738 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2739 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2740 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2741 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2742 pix2 = _mm_add_epi16(pix2,
2743 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2744 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2745 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the clamped bilinear blend
2749 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2750 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2751 tci = _mm_madd_epi16(tci, tcoffset);
2752 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2753 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2754 _mm_setzero_si128());
2755 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2756 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2757 _mm_setzero_si128());
2758 fracm = _mm_srli_epi16(subtc, 1);
2759 pix1 = _mm_add_epi16(pix1,
2760 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2761 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2762 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2763 pix1 = _mm_add_epi16(pix1,
2764 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2765 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2766 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// General bilinear path with wrapping: corner coordinates are AND-ed with
// tcmax, which is a true modulo only for power-of-two sizes
2772 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2774 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2775 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2776 tci = _mm_madd_epi16(tci, tcoffset);
2777 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2778 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2779 _mm_setzero_si128());
2780 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2781 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2782 _mm_setzero_si128());
2783 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2784 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2785 tci = _mm_madd_epi16(tci, tcoffset);
2786 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2787 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2788 _mm_setzero_si128());
2789 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2790 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2791 _mm_setzero_si128());
2792 fracm = _mm_srli_epi16(subtc, 1);
2793 pix1 = _mm_add_epi16(pix1,
2794 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2795 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2796 pix3 = _mm_add_epi16(pix3,
2797 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2798 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2799 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2800 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2801 pix2 = _mm_add_epi16(pix2,
2802 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2803 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2804 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the wrapped bilinear blend
2808 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2809 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2810 tci = _mm_madd_epi16(tci, tcoffset);
2811 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2812 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2813 _mm_setzero_si128());
2814 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2815 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2816 _mm_setzero_si128());
2817 fracm = _mm_srli_epi16(subtc, 1);
2818 pix1 = _mm_add_epi16(pix1,
2819 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2820 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2821 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2822 pix1 = _mm_add_epi16(pix1,
2823 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2824 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2825 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Unfiltered sampling: one whole texel is copied per output pixel.
// Clamp-to-edge variant first, two pixels per iteration plus a one-pixel form.
2832 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2834 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2836 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2837 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2838 tci = _mm_madd_epi16(tci, tcoffset);
2839 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2840 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2844 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2845 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2846 tci = _mm_madd_epi16(tci, tcoffset);
2847 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// unfiltered sampling with wrapped coordinates
2853 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2855 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2856 tci = _mm_and_si128(tci, tcmax);
2857 tci = _mm_madd_epi16(tci, tcoffset);
2858 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2859 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2863 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2864 tci = _mm_and_si128(tci, tcmax);
2865 tci = _mm_madd_epi16(tci, tcoffset);
2866 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2875 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2878 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Judging by its name, a shadow map sampling helper: takes a pointer to float
// components and returns a single float.
// NOTE(review): only the signature is visible here, so the number of
// components read from vector and the meaning and range of the result need
// to be confirmed from the body.
2881 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies in4f by an interpolated vertex attribute and stores the product
// in out4f, for pixels startx..endx-1 of the span (4 floats per pixel).
// data and slope are filled by DPSOFTRAST_CALCATTRIB4F for arrayindex; the
// attribute at pixel x is (data + slope*x) * z, evaluated per component.
// NOTE(review): z is presumably zf[x] for the current pixel - confirm.
2887 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2890 int startx = span->startx;
2891 int endx = span->endx;
2896 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2897 for (x = startx;x < endx;x++)
// c = attribute value at this pixel
2900 c[0] = (data[0] + slope[0]*x) * z;
2901 c[1] = (data[1] + slope[1]*x) * z;
2902 c[2] = (data[2] + slope[2]*x) * z;
2903 c[3] = (data[3] + slope[3]*x) * z;
// component-wise product with the input color
2904 out4f[x*4+0] = in4f[x*4+0] * c[0];
2905 out4f[x*4+1] = in4f[x*4+1] * c[1];
2906 out4f[x*4+2] = in4f[x*4+2] * c[2];
2907 out4f[x*4+3] = in4f[x*4+3] * c[3];
2911 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2914 int startx = span->startx;
2915 int endx = span->endx;
2920 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2921 for (x = startx;x < endx;x++)
2924 c[0] = (data[0] + slope[0]*x) * z;
2925 c[1] = (data[1] + slope[1]*x) * z;
2926 c[2] = (data[2] + slope[2]*x) * z;
2927 c[3] = (data[3] + slope[3]*x) * z;
2928 out4f[x*4+0] = c[0];
2929 out4f[x*4+1] = c[1];
2930 out4f[x*4+2] = c[2];
2931 out4f[x*4+3] = c[3];
2935 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2937 int x, startx = span->startx, endx = span->endx;
2938 float c[4], localcolor[4];
2939 localcolor[0] = subcolor[0];
2940 localcolor[1] = subcolor[1];
2941 localcolor[2] = subcolor[2];
2942 localcolor[3] = subcolor[3];
2943 for (x = startx;x < endx;x++)
2945 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2946 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2947 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2948 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2949 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2950 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2951 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2952 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2956 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2958 int x, startx = span->startx, endx = span->endx;
2959 for (x = startx;x < endx;x++)
2961 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2962 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2963 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2964 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2968 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2970 int x, startx = span->startx, endx = span->endx;
2971 for (x = startx;x < endx;x++)
2973 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2974 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2975 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2976 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2980 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2982 int x, startx = span->startx, endx = span->endx;
2984 for (x = startx;x < endx;x++)
2986 a = 1.0f - inb4f[x*4+3];
2988 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2989 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2990 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2991 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2995 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2997 int x, startx = span->startx, endx = span->endx;
2998 float localcolor[4], ilerp, lerp;
2999 localcolor[0] = color[0];
3000 localcolor[1] = color[1];
3001 localcolor[2] = color[2];
3002 localcolor[3] = color[3];
3003 ilerp = 1.0f - localcolor[3];
3004 lerp = localcolor[3];
3005 for (x = startx;x < endx;x++)
3007 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3008 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3009 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3010 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies a BGRA8 span (in4ub) by an interpolated vertex attribute and
// stores the result in out4ub, using SSE2 integer arithmetic.
// The modulator (data + slope*x) * zf[x] is evaluated in floating point at
// sub-span boundaries and stepped as 256-scaled integers in between.
// NOTE(review): several statements of this function are not among the lines
// shown here (declarations of x/data/slope/mod/endmod, the assignments that
// carry endmod/endsubmod over into mod/submod, and whatever guards and
// advances x around the single-pixel tail) - confirm against the full file.
3016 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3020 int startx = span->startx;
3021 int endx = span->endx;
3024 __m128i submod, substep, endsubmod;
// data/slope receive the attribute selected by arrayindex (macro defined elsewhere)
3025 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2, lanes 1 and 3 stay put (presumably RGBA -> BGRA to match the pixel bytes)
3026 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3027 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// modulator at the first pixel, in float and as 256-scaled integers
3028 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3029 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// x is advanced inside the body, not in the for header
3030 for (x = startx; x < endx;)
// default sub-span of DPSOFTRAST_DRAW_MAXSUBSPAN pixels; subscale converts the
// float difference across it into a 256-scaled per-pixel step
3032 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3033 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3034 if (nextsub >= endx)
// last sub-span: stop at the final pixel and rescale for the shorter length
3036 nextsub = endsub = endx-1;
3037 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// modulator at the end of this sub-span and the integer step towards it
3041 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3042 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3043 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = modulator for x, high half = modulator for x+1
3044 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3045 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): each iteration consumes two pixels but submod is advanced by a
// single substep; unless the step is doubled on a line not shown here the
// modulator moves at half rate inside a sub-span - confirm.
3046 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// pixel bytes go into the high byte of each 16-bit lane, so mulhi yields (pixel * modulator) >> 8
3048 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3049 pix = _mm_mulhi_epu16(pix, submod);
// pack back to bytes with unsigned saturation and store two pixels
3050 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3054 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3055 pix = _mm_mulhi_epu16(pix, submod);
3056 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute into a BGRA8 span (out4ub) using
// SSE2 integer arithmetic.  The value (data + slope*x) * zf[x] is evaluated in
// floating point at sub-span boundaries, scaled by 4095 and stepped as
// integers in between; the stored byte is that integer shifted right by 4.
// NOTE(review): several statements of this function are not among the lines
// shown here (declarations of x/data/slope/mod/endmod, the assignments that
// carry endmod/endsubmod over into mod/submod, and whatever guards and
// advances x around the single-pixel tail) - confirm against the full file.
3063 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3067 int startx = span->startx;
3068 int endx = span->endx;
3071 __m128i submod, substep, endsubmod;
// data/slope receive the attribute selected by arrayindex (macro defined elsewhere)
3072 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2, lanes 1 and 3 stay put (presumably RGBA -> BGRA to match the pixel bytes)
3073 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3074 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// value at the first pixel, in float and as 4095-scaled integers
3075 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3076 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// x is advanced inside the body, not in the for header
3077 for (x = startx; x < endx;)
// default sub-span of DPSOFTRAST_DRAW_MAXSUBSPAN pixels; subscale converts the
// float difference across it into a 4095-scaled per-pixel step
3079 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3080 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3081 if (nextsub >= endx)
// last sub-span: stop at the final pixel and rescale for the shorter length
3083 nextsub = endsub = endx-1;
3084 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// value at the end of this sub-span and the integer step towards it
3088 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3089 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3090 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for x, high half = value for x+1
3091 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3092 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): each iteration consumes two pixels but submod is advanced by a
// single substep; unless the step is doubled on a line not shown here the
// value moves at half rate inside a sub-span - confirm.
3093 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then pack to bytes with unsigned saturation
3095 __m128i pix = _mm_srai_epi16(submod, 4);
3096 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3100 __m128i pix = _mm_srai_epi16(submod, 4);
3101 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 bloom combine: per channel, out4ub = ina4ub + (inb4ub - subcolor*255),
// where the subtraction saturates at 0 and the final pack saturates at 255.
// Pixels in [span->startx, span->endx) are processed two at a time, followed
// by a single-pixel variant.  The triangle argument is not used.
// NOTE(review): the condition guarding the single-pixel tail is not among the
// lines shown here.
3108 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3111 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255 integers with lanes 0 and 2 swapped, then
// duplicated into both halves as 16-bit lanes (one copy per pixel)
3112 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3113 localcolor = _mm_packs_epi32(localcolor, localcolor);
3114 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input from bytes to 16-bit lanes
3116 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3117 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3118 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3119 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant
3123 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3124 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3125 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3126 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel product of two BGRA8 spans: out4ub = (ina4ub * inb4ub) >> 8.
// Pixels in [span->startx, span->endx) are processed two at a time, followed
// by a single-pixel variant.  The triangle argument is not used.
// NOTE(review): the condition guarding the single-pixel tail is not among the
// lines shown here.
3131 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3134 int x, startx = span->startx, endx = span->endx;
3135 for (x = startx;x+2 <= endx;x+=2)
// ina bytes land in the low byte of each 16-bit lane, inb bytes in the high
// byte, so the high half of the 16x16 product is (ina * inb) >> 8
3137 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3138 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3139 pix1 = _mm_mulhi_epu16(pix1, pix2);
3140 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant
3144 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3145 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3146 pix1 = _mm_mulhi_epu16(pix1, pix2);
3147 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel sum of two BGRA8 spans, saturated at 255 by the final pack.
// Pixels in [span->startx, span->endx) are processed two at a time, followed
// by a single-pixel variant.  The triangle argument is not used.
// NOTE(review): the condition guarding the single-pixel tail is not among the
// lines shown here.
3152 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3155 int x, startx = span->startx, endx = span->endx;
3156 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes so the sum cannot wrap before the pack
3158 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3159 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3160 pix1 = _mm_add_epi16(pix1, pix2);
3161 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant
3165 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3166 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3167 pix1 = _mm_add_epi16(pix1, pix2);
3168 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of one BGRA8 span to another:
// per channel, out4ub = ina4ub + ((inbtintbgra*256) * inb4ub >> 8), saturated
// at 255 by the final pack.  The tint is used in the lane order it is given
// (no lane swap is applied here).
// Pixels in [span->startx, span->endx) are processed two at a time, followed
// by a single-pixel variant.  The triangle argument is not used.
// NOTE(review): the condition guarding the single-pixel tail is not among the
// lines shown here.
3173 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3176 int x, startx = span->startx, endx = span->endx;
// tint scaled by 256, converted to integers and duplicated into both halves as 16-bit lanes
3177 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3178 tint = _mm_packs_epi32(tint, tint);
3179 for (x = startx;x+2 <= endx;x+=2)
// ina in the low byte of each lane, inb in the high byte (ready for mulhi)
3181 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3182 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3183 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3184 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant
3188 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3189 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3190 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3191 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Alpha-blends two BGRA8 spans using the alpha byte of the second one:
// per channel, out4ub = ina4ub + (((inb4ub - ina4ub) * inb_alpha) >> 8).
// Pixels in [span->startx, span->endx) are processed two at a time, followed
// by a single-pixel variant.  The triangle argument is not used.
// NOTE(review): the condition guarding the single-pixel tail is not among the
// lines shown here.
3196 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3199 int x, startx = span->startx, endx = span->endx;
3200 for (x = startx;x+2 <= endx;x+=2)
3202 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3203 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 (alpha) of each inb pixel across that pixel's four lanes
3204 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply returns (diff * alpha) >> 8
3205 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3206 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant
3210 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3211 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3212 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3213 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3214 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a BGRA8 span towards one constant colour, weighted by that colour's
// own alpha: per channel, out4ub = in4ub + (((color*255 - in4ub) * alpha255) >> 8).
// Pixels in [span->startx, span->endx) are processed two at a time, followed
// by a single-pixel variant.  The triangle argument is not used.
// NOTE(review): the condition guarding the single-pixel tail is not among the
// lines shown here.
3219 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3222 int x, startx = span->startx, endx = span->endx;
// colour scaled to 0..255 integers with lanes 0 and 2 swapped, then duplicated
// into both halves as 16-bit lanes (one copy per pixel)
3223 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3224 localcolor = _mm_packs_epi32(localcolor, localcolor);
// alpha lane broadcast over each pixel's four lanes and pre-shifted by 4 for the high-half multiply
3225 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3226 for (x = startx;x+2 <= endx;x+=2)
3228 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3229 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3230 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant
3234 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3235 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3236 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3243 void DPSOFTRAST_VertexShader_Generic(void)
3245 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3246 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3247 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3248 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3249 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3252 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3254 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3255 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3256 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3257 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3258 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3259 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3261 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3262 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3263 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3265 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3266 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3269 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3271 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3274 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3276 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3279 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3284 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3285 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3290 void DPSOFTRAST_VertexShader_PostProcess(void)
3292 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3293 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3294 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3297 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3299 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3300 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3301 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3302 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3303 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3304 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3305 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3307 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3308 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3310 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3311 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3313 // TODO: implement saturation
3315 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3317 // TODO: implement gammaramps
3319 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3324 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3326 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3329 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3331 // this is never called (because colormask is off when this shader is used)
3332 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3333 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3334 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3335 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3336 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3341 void DPSOFTRAST_VertexShader_FlatColor(void)
3343 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3344 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader for one span: the colour texture is
// sampled and every texel is multiplied by a constant built from the
// Color_Ambient uniform (colour lanes) and the Alpha uniform (alpha lane).
// Output goes straight to the framebuffer row unless alpha test or a
// non-opaque blend mode is active, in which case it goes through a temporary
// buffer and DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): parts of the loop body (declarations of color/pix/pix2, any
// handling of pixelmask for the single-pixel path, the extra advance of x
// after the four-pixel path) are not among the lines shown here.
3347 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3350 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row inside the first colour buffer, 4 bytes per pixel
3351 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3352 int x, startx = span->startx, endx = span->endx;
3353 __m128i Color_Ambientm;
3354 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3355 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3358 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the Finish pass, so render into the temporary buffer instead
3359 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3360 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256 with lanes 0 and 2 swapped; lane 3 is then
// replaced by the Alpha uniform scaled by 255, and everything is packed to 16-bit lanes
3361 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3362 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3363 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3364 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3365 for (x = startx;x < endx;x++)
// fast path: four pixels at once when all four mask bytes equal 1
3368 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texel bytes go into the high byte of each 16-bit lane, so mulhi yields (constant * texel) >> 8
3371 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3372 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3373 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3374 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel variant of the same computation
3380 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3381 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3382 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer path needs the Finish pass
3384 if (pixel == buffer_FragColorbgra8)
3385 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3391 void DPSOFTRAST_VertexShader_VertexColor(void)
3393 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3394 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3395 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader for one span.  Per pixel:
//   out = texel * (Color_Diffuse * interpolated vertex colour + Color_Ambient)
// computed in 16-bit fixed point; the alpha lane of the diffuse term is
// cleared and the alpha lane of the ambient term comes from the Alpha uniform.
// Output goes straight to the framebuffer row unless alpha test or a
// non-opaque blend mode is active, in which case it goes through a temporary
// buffer and DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): parts of this function (declarations of data/slope/mod2/pix2,
// any handling of pixelmask for the single-pixel path, the extra advance of x
// after the four-pixel path) are not among the lines shown here.
3398 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3401 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row inside the first colour buffer, 4 bytes per pixel
3402 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3403 int x, startx = span->startx, endx = span->endx;
3404 __m128i Color_Ambientm, Color_Diffusem;
3406 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3407 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3408 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3409 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3410 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3411 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the Finish pass, so render into the temporary buffer instead
3412 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3413 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256 with lanes 0 and 2 swapped; lane 3 is replaced
// by the Alpha uniform scaled by 255; packed to 16-bit lanes
3414 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3415 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3416 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3417 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 4096 with lanes 0 and 2 swapped and lane 3 cleared; packed to 16-bit lanes
3418 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3419 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3420 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex colour attribute: lanes 0 and 2 swapped, advanced to startx and scaled by 4096
3421 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3422 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3423 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3424 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3425 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3426 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by one slope per loop iteration in the for header
3427 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3429 __m128i color, mod, pix;
// fast path: four pixels at once when all four mask bytes equal 1
3430 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four z values; each pixel's attribute is multiplied by its own z, and data
// is stepped three times here in addition to the step in the for header
3433 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3434 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3435 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3436 data = _mm_add_ps(data, slope);
3437 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3438 data = _mm_add_ps(data, slope);
3439 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3440 data = _mm_add_ps(data, slope);
3441 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex colour + ambient) * texel, texel bytes in the high byte of each lane
3442 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3443 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3444 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3445 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3446 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel variant of the same computation
3452 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3453 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3454 mod = _mm_packs_epi32(mod, mod);
3455 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3456 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer path needs the Finish pass
3458 if (pixel == buffer_FragColorbgra8)
3459 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3465 void DPSOFTRAST_VertexShader_Lightmap(void)
3467 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3468 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3469 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span.  Per pixel:
//   out = texel * (Color_Diffuse * lightmap + Color_Ambient)
// and, with the GLOW permutation, additionally + Color_Glow * glow texel.
// All arithmetic is 16-bit fixed point with the uniforms scaled by 256; the
// alpha lanes of the diffuse and glow constants are cleared and the alpha lane
// of the ambient constant comes from the Alpha uniform.
// Output goes straight to the framebuffer row unless alpha test or a
// non-opaque blend mode is active, in which case it goes through a temporary
// buffer and DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): parts of this function (the pix2 declarations, any handling of
// pixelmask for the single-pixel paths, the extra advance of x after the
// four-pixel paths, and the branch structure separating the glow loop from
// the non-glow loop) are not among the lines shown here.
3472 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3475 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row inside the first colour buffer, 4 bytes per pixel
3476 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3477 int x, startx = span->startx, endx = span->endx;
3478 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3479 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3480 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3481 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3482 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3483 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3484 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// colour texture uses TEXCOORD0, lightmap uses TEXCOORD4
3485 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3486 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the Finish pass, so render into the temporary buffer instead
3487 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3488 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256 with lanes 0 and 2 swapped; lane 3 is replaced
// by the Alpha uniform scaled by 255; packed to 16-bit lanes
3489 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3490 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3491 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3492 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 256 with lanes 0 and 2 swapped and lane 3 cleared
3493 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3494 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3495 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3496 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture shares TEXCOORD0 with the colour texture
3498 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow colour scaled by 256 with lanes 0 and 2 swapped and lane 3 cleared
3499 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3500 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3501 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow; used by the single-pixel path below
3502 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3503 for (x = startx;x < endx;x++)
3505 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when all four mask bytes equal 1
3506 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3509 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3510 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3511 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * texel + glow colour * glow texel, two pixels per register
3512 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3513 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3514 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3515 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3516 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3517 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3518 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel variant: the lit texel is computed in the low half and the
// glow term in the high half of one register, then the halves are added
3524 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3525 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3526 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3527 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3528 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3529 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same loop without the glow term
3534 for (x = startx;x < endx;x++)
3536 __m128i color, lightmap, pix;
// fast path: four pixels at once when all four mask bytes equal 1
3537 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3540 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3541 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3542 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3543 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3544 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3545 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3546 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel variant
3552 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3553 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3554 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3555 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer path needs the Finish pass
3558 if (pixel == buffer_FragColorbgra8)
3559 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations of the shared LightDirection shaders. The FakeLight and
// LightDirectionMap entry points that follow call these two functions before
// their definitions appear further down in the file.
3564 void DPSOFTRAST_VertexShader_LightDirection(void);
3565 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex shader entry point for the fake-light mode.
// It has no setup of its own: it only runs the shared LightDirection vertex shader.
3567 void DPSOFTRAST_VertexShader_FakeLight(void)
3569 DPSOFTRAST_VertexShader_LightDirection();
// Pixel shader entry point for the fake-light mode.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which compares thread->shader_mode with SHADERMODE_FAKELIGHT to pick its fake-light path.
3572 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3574 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the model-space light-direction-map mode.
// Runs the shared LightDirection vertex shader and then additionally loads
// TEXCOORD4, the array DPSOFTRAST_PixelShader_LightDirection passes to its
// lightmap and deluxemap texture fetches.
3579 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3581 DPSOFTRAST_VertexShader_LightDirection();
// Same array index is given twice; DPSOFTRAST_Array_Load is defined elsewhere -
// presumably it copies the input array through untransformed (TODO confirm).
3582 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the model-space light-direction-map mode.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which compares thread->shader_mode with SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE
// to pick its model-space deluxemap path.
3585 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3587 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the tangent-space light-direction-map mode.
// Runs the shared LightDirection vertex shader and then additionally loads
// TEXCOORD4, the array DPSOFTRAST_PixelShader_LightDirection passes to its
// lightmap and deluxemap texture fetches.
3592 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3594 DPSOFTRAST_VertexShader_LightDirection();
// Same array index is given twice; DPSOFTRAST_Array_Load is defined elsewhere -
// presumably it copies the input array through untransformed (TODO confirm).
3595 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the tangent-space light-direction-map mode.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which compares thread->shader_mode with SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE
// to pick its tangent-space deluxemap path.
3598 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3600 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Shared vertex shader for the LightDirection family of shader modes.
// For every vertex it expresses the LightDir uniform and the vertex-to-eye
// vector in the basis formed by the TEXCOORD1 / TEXCOORD2 / TEXCOORD3 vectors
// (read here as svector / tvector / normal), stores the results in the
// TEXCOORD5 and TEXCOORD6 post arrays, and finally hands the positions to
// DPSOFTRAST_Array_TransformProject with the model-view-projection matrix.
// Takes no arguments and returns nothing; all inputs and outputs live in the
// global dpsoftrast state.
3605 void DPSOFTRAST_VertexShader_LightDirection(void)
3608 int numvertices = dpsoftrast.numvertices;
3610 float LightVector[4];
3611 float EyePosition[4];
3612 float EyeVectorModelSpace[4];
// Snapshot the two uniforms used per vertex. Component 3 of each is copied
// but is not read anywhere in this function's visible code.
3618 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3619 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3620 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3621 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3622 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3623 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3624 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3625 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Stage the input arrays: position and TEXCOORD1..3 go through Array_Load,
// TEXCOORD0 goes through Array_Transform with the TexMatrix uniform.
3626 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3627 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3628 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3629 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3630 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3631 for (i = 0;i < numvertices;i++)
// Fetch the xyz components of this vertex's position and basis vectors
// (4 floats are stored per vertex, hence the i*4 stride).
3633 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3634 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3635 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3636 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3637 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3638 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3639 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3640 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3641 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3642 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3643 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3644 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (s . LightDir, t . LightDir, n . LightDir),
// written to TEXCOORD5 with w forced to 0.
3645 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3646 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3647 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3648 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3649 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3650 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3651 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye position, then projected onto the same
// s/t/n basis and written to TEXCOORD6 with w forced to 0.
3652 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3653 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3654 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3655 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3656 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3657 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3658 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3659 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3660 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3661 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Issued after the per-vertex loop, which reads post_array4f[DPSOFTRAST_ARRAY_POSITION].
// The meaning of the -1 argument is defined by DPSOFTRAST_Array_TransformProject,
// which is outside this view.
3663 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar and 3-component vector helper macros.
// Min/Max expand each argument twice, so arguments must be free of side effects.
3666 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3667 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of elements [0], [1] and [2] of two indexable operands.
3668 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length (v . v) and length (square root of the squared length, via sqrt()).
3669 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3670 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3671 #define DPSOFTRAST_Vector3Normalize(v)\
3674 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Shared per-span pixel shader for the LightDirection, LightDirectionMap
// (model-space / tangent-space) and FakeLight shader modes.
//
// Samples the colour texture for the span, then shades pixels span->startx ..
// span->endx-1 along one of three paths selected by thread->shader_permutation:
//   SHADERPERMUTATION_SPECULAR : ambient + (diffuse + specular) * light colour
//   SHADERPERMUTATION_DIFFUSE  : ambient + diffuse * light colour
//   otherwise (the final loop) : ambient only
// Every path adds glow texture * Color_Glow when SHADERPERMUTATION_GLOW is set,
// limits each channel to at most 255, writes B,G,R,A bytes to a local buffer
// and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// Within the two lit paths, thread->shader_mode decides where the light
// direction and light colour come from: the deluxemap / lightmap textures
// (LIGHTDIRECTIONMAP modes), the eye vector with white light (FAKELIGHT), or
// the interpolated TEXCOORD5 light vector with the LightColor uniform.
//
// thread   : per-thread state (uniform4f, shader_mode, shader_permutation, shader_exactspecularmath)
// triangle : passed through to the span and attribute helpers
// span     : supplies startx / endx and is passed through to the helpers
3685 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one float per span position for z, four bytes
// (B,G,R,A) per span position for every sampled texture and for the output.
3687 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3688 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3689 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3690 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3691 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3692 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3693 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3694 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3695 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3696 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3697 int x, startx = span->startx, endx = span->endx;
3698 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// "data" + "slope" * x pairs are the per-span linear interpolants filled in
// by DPSOFTRAST_CALCATTRIB4F.
3699 float LightVectordata[4];
3700 float LightVectorslope[4];
3701 float EyeVectordata[4];
3702 float EyeVectorslope[4];
3703 float VectorSdata[4];
3704 float VectorSslope[4];
3705 float VectorTdata[4];
3706 float VectorTslope[4];
3707 float VectorRdata[4];
3708 float VectorRslope[4];
3710 float diffusetex[4];
3712 float surfacenormal[4];
3713 float lightnormal[4];
3714 float lightnormal_modelspace[4];
3716 float specularnormal[4];
3719 float SpecularPower;
// Copy the colour uniforms with components 0 and 2 swapped so they line up
// with the B,G,R,A byte order of the span buffers (presumably the uniforms
// are stored R,G,B - TODO confirm).
3721 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3722 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3723 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3724 Color_Glow[3] = 0.0f;
3725 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3726 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3727 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The alpha slot of the ambient colour is taken from the separate Alpha
// uniform; it is the only factor applied to the texture alpha on output.
3728 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3729 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3730 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3731 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3732 Color_Pants[3] = 0.0f;
3733 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3734 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3735 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3736 Color_Shirt[3] = 0.0f;
// DPSOFTRAST_Draw_Span_Begin receives buffer_z, which every texture fetch
// below is also given. The base colour texture is always sampled.
3737 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3738 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Pants / shirt layers are only sampled when colour mapping is enabled.
3739 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3741 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3742 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// The glow layer is only sampled when glow is enabled.
3744 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3746 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Specular path: ambient + diffuse + specular ----
3748 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3750 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3751 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3752 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3753 Color_Diffuse[3] = 0.0f;
3754 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3755 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3756 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3757 LightColor[3] = 0.0f;
3758 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3759 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3760 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3761 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3762 Color_Specular[3] = 0.0f;
// Pre-divide by 255: the exponent is later multiplied by the gloss texture's
// alpha byte (0..255) in the pow() call.
3763 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed by every specular variant.
3764 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3765 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs for the light direction.
// Model space: S/T/R interpolants (TEXCOORD1..3) plus lightmap and deluxemap
// samples addressed through TEXCOORD4.
3767 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3769 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3770 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3771 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3772 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3773 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Tangent space: only the lightmap and deluxemap samples are needed.
3775 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3777 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3778 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3780 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3782 // nothing of this needed
// Remaining case (presumably the plain LightDirection mode): interpolate the
// light vector stored in TEXCOORD5.
3786 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3789 for (x = startx;x < endx;x++)
// Base colour as floats in 0..255.
3792 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3793 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3794 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3795 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3796 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add the tinted pants and shirt layers to the base colour. The alpha line
// adds nothing because Color_Pants[3] and Color_Shirt[3] are 0.
3798 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3799 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3800 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3801 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3803 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3804 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3805 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3806 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte * 1/128 - 1 maps 0..255 to -1..+0.992.
// Buffer index 2 feeds component 0 because the buffer is B,G,R,A ordered.
3807 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3808 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3809 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3810 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Pick the light direction (lightnormal) and LightColor for this pixel.
3812 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3814 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3815 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3816 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3817 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3819 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3820 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3821 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3822 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3824 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3825 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3826 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3827 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3829 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3830 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3831 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3832 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3834 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3835 DPSOFTRAST_Vector3Normalize(lightnormal);
3837 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Scale the lightmap bytes by 1/256 and divide by the light normal's z
// component, which is floored at 0.25 to bound the amplification.
3839 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3840 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3841 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3842 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3845 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// Tangent space: the decoded deluxemap texel is used directly as the light
// direction (no normalisation step appears here); lightmap bytes are scaled by 1/256.
3847 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3848 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3849 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3851 float f = 1.0f / 256.0f;
3852 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3853 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3854 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3857 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: the interpolated eye vector doubles as the light direction and
// the light colour is white.
3859 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3860 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3861 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3862 DPSOFTRAST_Vector3Normalize(lightnormal);
3864 LightColor[0] = 1.0;
3865 LightColor[1] = 1.0;
3866 LightColor[2] = 1.0;
// Remaining case: interpolated TEXCOORD5 light vector, scaled by z and
// normalised. LightColor keeps the uniform value assigned before the loop.
3870 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3871 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3872 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3873 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: surface normal . light direction, negative values clamped to zero.
3876 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3878 if(thread->shader_exactspecularmath)
3880 // reflect lightnormal at surfacenormal, take the negative of that
3881 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3883 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3884 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3885 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3886 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3888 // dot of this and normalize(EyeVectorFogDepth.xyz)
3889 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3890 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3891 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3892 DPSOFTRAST_Vector3Normalize(eyenormal);
3894 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Non-exact variant: normalised sum of the light and eye directions
// (half-vector), dotted with the surface normal.
3898 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3899 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3900 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3901 DPSOFTRAST_Vector3Normalize(eyenormal);
3903 specularnormal[0] = lightnormal[0] + eyenormal[0];
3904 specularnormal[1] = lightnormal[1] + eyenormal[1];
3905 specularnormal[2] = lightnormal[2] + eyenormal[2];
3906 DPSOFTRAST_Vector3Normalize(specularnormal);
3908 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform / 255 * gloss alpha byte.
3911 specular = pow(specular, SpecularPower * glosstex[3]);
// Compose the pixel. Only the upper bound (255) is clamped. Alpha is the
// texture alpha times Color_Ambient[3] and receives no lighting.
3912 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3914 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3915 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3916 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3917 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Same formula without the glow term.
3921 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3922 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3923 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3924 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Store the pixel as B,G,R,A bytes.
3927 buffer_FragColorbgra8[x*4+0] = d[0];
3928 buffer_FragColorbgra8[x*4+1] = d[1];
3929 buffer_FragColorbgra8[x*4+2] = d[2];
3930 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Diffuse path: ambient + diffuse, no gloss texture or specular term ----
3933 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3935 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3936 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3937 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3938 Color_Diffuse[3] = 0.0f;
3939 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3940 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3941 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3942 LightColor[3] = 0.0f;
3943 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs, as in the specular path.
3945 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3947 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3948 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3949 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3950 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3951 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3953 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3955 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3956 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3958 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// On this path the eye vector interpolants are computed only for fake light,
// which is the only mode here that reads them.
3960 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// Remaining case: interpolate the light vector stored in TEXCOORD5.
3964 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3967 for (x = startx;x < endx;x++)
// NOTE(review): unlike the specular loop, the pants / shirt layers are not
// added to diffusetex here even though they are sampled when COLORMAPPING is
// set - confirm whether that is intended.
3970 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3971 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3972 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3973 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode the normal map: byte * 1/128 - 1, with the B,G,R order reversed.
3974 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3975 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3976 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3977 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Pick the light direction and LightColor; the four cases repeat the
// specular loop's selection.
3979 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3981 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3982 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3983 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3984 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3986 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3987 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3988 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3989 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3991 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3992 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3993 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3994 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3996 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3997 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3998 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3999 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4001 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4002 DPSOFTRAST_Vector3Normalize(lightnormal);
4004 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes * 1/256, divided by the light normal's z floored at 0.25.
4006 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4007 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4008 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4009 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4012 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// Tangent space: decoded deluxemap texel used directly; lightmap bytes * 1/256.
4014 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4015 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4016 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4018 float f = 1.0f / 256.0f;
4019 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4020 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4021 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4024 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: eye vector as light direction, white light colour.
4026 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4027 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4028 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4029 DPSOFTRAST_Vector3Normalize(lightnormal);
4031 LightColor[0] = 1.0;
4032 LightColor[1] = 1.0;
4033 LightColor[2] = 1.0;
// Remaining case: interpolated TEXCOORD5 light vector; LightColor keeps the
// uniform value assigned before the loop.
4037 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4038 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4039 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4040 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term clamped at zero, then colour = glow (optional) +
// texture * (ambient + diffuse colour * diffuse term * light colour).
4043 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4044 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4046 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4047 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4048 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4049 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// Same formula without the glow term.
4053 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4054 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4055 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4056 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4058 buffer_FragColorbgra8[x*4+0] = d[0];
4059 buffer_FragColorbgra8[x*4+1] = d[1];
4060 buffer_FragColorbgra8[x*4+2] = d[2];
4061 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Ambient-only path: texture * ambient colour, plus optional glow ----
// NOTE(review): the pants / shirt layers are not applied in this loop either.
4066 for (x = startx;x < endx;x++)
4069 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4070 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4071 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4072 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4074 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4076 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4077 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4078 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4079 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4083 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4084 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4085 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4086 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4088 buffer_FragColorbgra8[x*4+0] = d[0];
4089 buffer_FragColorbgra8[x*4+1] = d[1];
4090 buffer_FragColorbgra8[x*4+2] = d[2];
4091 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finishing routine.
4094 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4099 void DPSOFTRAST_VertexShader_LightSource(void)
4102 int numvertices = dpsoftrast.numvertices;
4103 float LightPosition[4];
4104 float LightVector[4];
4105 float LightVectorModelSpace[4];
4106 float EyePosition[4];
4107 float EyeVectorModelSpace[4];
4113 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4114 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4115 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4116 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4117 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4118 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4119 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4120 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4121 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4122 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4123 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4124 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4125 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4126 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4127 for (i = 0;i < numvertices;i++)
4129 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4130 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4131 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4132 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4133 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4134 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4135 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4136 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4137 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4138 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4139 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4140 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4141 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4142 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4143 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4144 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4145 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4146 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4147 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4148 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4149 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4150 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4151 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4152 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4153 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4154 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4155 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4156 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4157 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4158 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4159 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4160 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4162 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4163 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4166 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4169 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4170 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4171 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4172 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4173 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4174 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4175 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4176 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4177 int x, startx = span->startx, endx = span->endx;
4178 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4179 float CubeVectordata[4];
4180 float CubeVectorslope[4];
4181 float LightVectordata[4];
4182 float LightVectorslope[4];
4183 float EyeVectordata[4];
4184 float EyeVectorslope[4];
4186 float diffusetex[4];
4188 float surfacenormal[4];
4189 float lightnormal[4];
4191 float specularnormal[4];
4194 float SpecularPower;
4195 float CubeVector[4];
4198 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4199 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4200 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4201 Color_Glow[3] = 0.0f;
4202 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4203 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4204 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4205 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4206 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4207 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4208 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4209 Color_Diffuse[3] = 0.0f;
4210 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4211 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4212 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4213 Color_Specular[3] = 0.0f;
4214 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4215 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4216 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4217 Color_Pants[3] = 0.0f;
4218 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4219 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4220 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4221 Color_Shirt[3] = 0.0f;
4222 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4223 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4224 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4225 LightColor[3] = 0.0f;
4226 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4227 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4228 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4229 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4230 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4231 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4232 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4233 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4235 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4236 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4238 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4239 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4240 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4242 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4243 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4244 for (x = startx;x < endx;x++)
4247 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4248 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4249 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4250 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4251 if (attenuation < 0.01f)
4253 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4255 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4256 if (attenuation < 0.01f)
4260 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4261 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4262 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4263 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4264 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4266 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4267 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4268 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4269 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4271 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4272 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4273 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4274 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4275 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4276 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4277 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4278 DPSOFTRAST_Vector3Normalize(surfacenormal);
4280 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4281 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4282 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4283 DPSOFTRAST_Vector3Normalize(lightnormal);
4285 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4287 if(thread->shader_exactspecularmath)
4289 // reflect lightnormal at surfacenormal, take the negative of that
4290 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4292 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4293 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4294 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4295 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4297 // dot of this and normalize(EyeVectorFogDepth.xyz)
4298 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4299 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4300 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4301 DPSOFTRAST_Vector3Normalize(eyenormal);
4303 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4307 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4308 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4309 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4310 DPSOFTRAST_Vector3Normalize(eyenormal);
4312 specularnormal[0] = lightnormal[0] + eyenormal[0];
4313 specularnormal[1] = lightnormal[1] + eyenormal[1];
4314 specularnormal[2] = lightnormal[2] + eyenormal[2];
4315 DPSOFTRAST_Vector3Normalize(specularnormal);
4317 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4319 specular = pow(specular, SpecularPower * glosstex[3]);
4321 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4323 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4324 attenuation *= (1.0f / 255.0f);
4325 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4326 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4327 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4328 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4332 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4333 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4334 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4335 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4337 buffer_FragColorbgra8[x*4+0] = d[0];
4338 buffer_FragColorbgra8[x*4+1] = d[1];
4339 buffer_FragColorbgra8[x*4+2] = d[2];
4340 buffer_FragColorbgra8[x*4+3] = d[3];
4343 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4345 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4346 for (x = startx;x < endx;x++)
4349 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4350 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4351 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4352 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4353 if (attenuation < 0.01f)
4355 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4357 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4358 if (attenuation < 0.01f)
4362 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4363 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4364 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4365 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4366 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4368 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4369 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4370 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4371 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4373 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4374 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4375 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4376 DPSOFTRAST_Vector3Normalize(surfacenormal);
4378 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4379 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4380 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4381 DPSOFTRAST_Vector3Normalize(lightnormal);
4383 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4384 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4386 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4387 attenuation *= (1.0f / 255.0f);
4388 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4389 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4390 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4391 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4395 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4396 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4397 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4398 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4400 buffer_FragColorbgra8[x*4+0] = d[0];
4401 buffer_FragColorbgra8[x*4+1] = d[1];
4402 buffer_FragColorbgra8[x*4+2] = d[2];
4403 buffer_FragColorbgra8[x*4+3] = d[3];
4408 for (x = startx;x < endx;x++)
4411 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4412 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4413 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4414 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4415 if (attenuation < 0.01f)
4417 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4419 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4420 if (attenuation < 0.01f)
4424 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4425 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4426 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4427 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4428 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4430 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4431 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4432 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4433 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4435 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4437 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4438 attenuation *= (1.0f / 255.0f);
4439 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4440 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4441 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4442 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4446 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4447 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4448 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4449 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4451 buffer_FragColorbgra8[x*4+0] = d[0];
4452 buffer_FragColorbgra8[x*4+1] = d[1];
4453 buffer_FragColorbgra8[x*4+2] = d[2];
4454 buffer_FragColorbgra8[x*4+3] = d[3];
4457 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4463 void DPSOFTRAST_VertexShader_Refraction(void)
4465 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4466 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4467 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4470 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4472 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4474 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4476 int x, startx = span->startx, endx = span->endx;
4479 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4480 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4483 float ModelViewProjectionPositiondata[4];
4484 float ModelViewProjectionPositionslope[4];
4487 float ScreenScaleRefractReflect[2];
4488 float ScreenCenterRefractReflect[2];
4489 float DistortScaleRefractReflect[2];
4490 float RefractColor[4];
4492 const unsigned char * RESTRICT pixelbase;
4493 const unsigned char * RESTRICT pixel[4];
4494 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4495 if(!texture) return;
4496 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4499 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4500 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4503 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4506 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4507 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4508 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4509 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4510 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4511 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4512 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4513 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4514 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4515 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4518 for (x = startx;x < endx;x++)
4520 float SafeScreenTexCoord[2];
4521 float ScreenTexCoord[2];
4528 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4529 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4531 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4532 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4533 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4535 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4536 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4537 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4538 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4539 DPSOFTRAST_Vector3Normalize(v);
4540 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4541 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4543 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4544 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4546 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4547 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4548 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4549 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4550 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4551 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4552 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4553 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4554 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4555 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4556 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4557 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4558 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4559 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4560 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4561 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4562 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4566 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4567 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4568 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4569 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4575 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4576 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4577 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4578 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4579 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4582 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4587 void DPSOFTRAST_VertexShader_Water(void)
4589 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4593 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4596 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4597 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4598 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4599 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4600 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4605 void DPSOFTRAST_VertexShader_ShowDepth(void)
4607 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4610 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4613 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4614 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4615 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4616 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4617 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4622 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4624 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4627 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4630 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4631 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4632 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4633 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4634 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4639 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4641 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4644 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4647 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4648 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4649 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4650 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4651 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode record used by DPSOFTRAST_ShaderModeTable: the vertex-stage and
// span-stage callbacks of one shader mode plus the vertex arrays and texture units
// listed for it.
// NOTE(review): every DPSOFTRAST_ShaderModeTable row begins with an integer initializer
// ahead of the Vertex pointer, so a leading member appears to be absent from this
// excerpt - confirm against the complete declaration.
4656 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage callback; takes no arguments and returns nothing
4659 void (*Vertex)(void);
// span-stage (pixel shader) callback, called with the thread state, a triangle and a span
4660 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices listed for this mode; the table rows end each list with ~0
4661 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* texture unit indices listed for this mode; the table rows end each list with ~0
4662 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4664 DPSOFTRAST_ShaderModeInfo;
4666 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4668 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4669 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4670 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4671 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4672 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4673 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4674 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4675 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4676 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4677 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4678 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4679 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4680 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4681 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4682 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4683 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Runs every span queued on this thread (thread->spans[0 .. numspans-1]) and then
// resets thread->numspans to 0.
// With depth testing enabled and a depth buffer present, a per-pixel mask is built
// from the depth comparison before the shader mode's Span callback is invoked;
// otherwise the mask is filled with 1s. The Span callback only runs when color
// buffer 0 exists and thread->fb_colormask is non-zero.
4686 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4693 // unsigned int *colorpixel;
4694 unsigned int *depthpixel;
4700 DPSOFTRAST_State_Triangle *triangle;
4701 DPSOFTRAST_State_Span *span;
4702 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
4703 for (i = 0; i < thread->numspans; i++)
4705 span = &thread->spans[i];
4706 triangle = &thread->triangles[span->triangle];
4707 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope, w[2] = origin) at the
// span position and convert it to the integer depth scale, applying polygon offset
4709 wslope = triangle->w[0];
4710 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4711 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4712 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4713 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4714 startx = span->startx;
// fill pixelmask[x] with the result of comparing the stored depth (left operand)
// against the interpolated depth d under the active depth function
4716 switch(thread->fb_depthfunc)
4719 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4720 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4721 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4722 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4723 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4724 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4725 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4727 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4728 //for (x = startx;x < endx;x++)
4729 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4730 // if there is no color buffer, skip pixel shader
// the two loops below scan for masked-out pixels at the left and right ends of the span
// (presumably trimming startx/endx so the shader only sees the surviving range)
4731 while (startx < endx && !pixelmask[startx])
4733 while (endx > startx && !pixelmask[endx-1])
4736 continue; // no pixels to fill
4737 span->pixelmask = pixelmask;
4738 span->startx = startx;
4740 // run pixel shader if appropriate
4741 // do this before running depthmask code, to allow the pixelshader
4742 // to clear pixelmask values for alpha testing
4743 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4744 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth writes enabled: walk the span again with the same interpolated depth sequence
4745 if (thread->depthmask)
4746 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4752 // no depth testing means we're just dealing with color...
4753 // if there is no color buffer, skip pixel shader
4754 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel in the span passes, so the mask is simply all 1s
4756 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4757 span->pixelmask = pixelmask;
4758 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4762 thread->numspans = 0;
// Draw command record (opcode 22): scanline range (starty/endy), a reference count that
// each interpreting thread decrements, the clipped flag, vertex/triangle counts and the
// pointers to the vertex data and element indices.
// NOTE(review): DEFCOMMAND is defined elsewhere; presumably it generates
// DPSOFTRAST_Command_Draw and DPSOFTRAST_OPCODE_Draw - confirm.
4765 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Interprets one Draw command on the given thread.
// For each triangle of the command it: fetches the three element indices, clips against
// the near plane (producing 3 or 4 screen-space points), clamps the integer bounds to the
// scissor/band limits, sets up the w plane and the per-array interpolants, chooses a mip
// level per texture unit, and emits spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
// into thread->spans. Span and triangle buffers are flushed through
// DPSOFTRAST_Draw_ProcessSpans when they fill up and once more at the end.
// The command's refcount is decremented when this thread is done with it; the thread that
// brings it to zero frees command->arrays if the data was allocated outside the command.
4767 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4770 int cullface = thread->cullface;
4771 int minx, maxx, miny, maxy;
4772 int miny1, maxy1, miny2, maxy2;
4773 __m128i fbmin, fbmax;
4774 __m128 viewportcenter, viewportscale;
4775 int firstvertex = command->firstvertex;
4776 int numvertices = command->numvertices;
4777 int numtriangles = command->numtriangles;
4778 const int *element3i = command->element3i;
4779 const unsigned short *element3s = command->element3s;
4780 int clipped = command->clipped;
4787 int starty, endy, bandy;
4791 __m128 triangleedge1, triangleedge2, trianglenormal;
4794 DPSOFTRAST_State_Triangle *triangle;
4795 DPSOFTRAST_Texture *texture;
4796 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two scanline bands (miny1..maxy1 and miny2..maxy2) to the scissor rectangle
4797 miny = thread->fb_scissor[1];
4798 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4799 miny1 = bound(miny, thread->miny1, maxy);
4800 maxy1 = bound(miny, thread->maxy1, maxy);
4801 miny2 = bound(miny, thread->miny2, maxy);
4802 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's scanline range overlaps neither band: just release this thread's reference
4803 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4805 if (!ATOMIC_DECREMENT(command->refcount))
4807 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4808 MM_FREE(command->arrays);
4812 minx = thread->fb_scissor[0];
4813 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clip bounds packed as 16-bit (x, y) pairs; fbmax is made inclusive by subtracting 1
4814 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4815 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4816 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4817 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4818 screen[3] = _mm_setzero_ps();
4819 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4820 for (i = 0;i < numtriangles;i++)
4822 const float *screencoord4f = command->arrays;
4823 const float *arrays = screencoord4f + numvertices*4;
4825 // generate the 3 edges of this triangle
4826 // generate spans for the triangle - switch based on left split or right split classification of triangle
4829 e[0] = element3s[i*3+0] - firstvertex;
4830 e[1] = element3s[i*3+1] - firstvertex;
4831 e[2] = element3s[i*3+2] - firstvertex;
4835 e[0] = element3i[i*3+0] - firstvertex;
4836 e[1] = element3i[i*3+1] - firstvertex;
4837 e[2] = element3i[i*3+2] - firstvertex;
// SKIPBACKFACE: builds the two screen-space edge vectors from screen[1], computes the
// scalar (z) component of their cross product and tests its sign against zero
4846 #define SKIPBACKFACE \
4847 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4848 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4849 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4850 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4851 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4855 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4859 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4864 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4865 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4867 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4868 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4870 #define CLIPPEDVERTEXCOPY(k,p1) \
4871 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIB*: the same copy / lerp scheme applied to an attribute array, reusing the
// clipfrac[] fractions that CLIPPEDVERTEXLERP stored for the clipped edges
4873 #define GENATTRIBCOPY(attrib, p1) \
4874 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4875 #define GENATTRIBLERP(attrib, p1, p2) \
4877 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4878 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4880 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4884 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4885 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4886 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4887 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4888 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4889 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4890 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4896 // calculate distance from nearplane
4897 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4898 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4899 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4900 if (clipdist[0] >= 0.0f)
4902 if (clipdist[1] >= 0.0f)
4904 if (clipdist[2] >= 0.0f)
4907 // triangle is entirely in front of nearplane
4908 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4915 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4923 if (clipdist[2] >= 0.0f)
4925 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4932 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4939 else if (clipdist[1] >= 0.0f)
4941 if (clipdist[2] >= 0.0f)
4943 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4950 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4956 else if (clipdist[2] >= 0.0f)
4958 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4963 else continue; // triangle is entirely behind nearplane
4966 // calculate integer y coords for triangle points
4967 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4968 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4969 screenmin = _mm_min_epi16(screeni, screenir),
4970 screenmax = _mm_max_epi16(screeni, screenir);
4971 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4972 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4973 screenmin = _mm_max_epi16(screenmin, fbmin);
4974 screenmax = _mm_min_epi16(screenmax, fbmax);
4975 // skip offscreen triangles
4976 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4978 starty = _mm_extract_epi16(screenmin, 1);
4979 endy = _mm_extract_epi16(screenmax, 1)+1;
4980 if (starty >= maxy1 && endy <= miny2)
4982 screeny = _mm_srai_epi32(screeni, 16);
4985 triangle = &thread->triangles[thread->numtriangles];
4987 // calculate attribute plans for triangle data...
4988 // okay, this triangle is going to produce spans, we'd better project
4989 // the interpolants now (this is what gives perspective texturing),
4990 // this consists of simply multiplying all arrays by the W coord
4991 // (which is basically 1/Z), which will be undone per-pixel
4992 // (multiplying by Z again) to get the perspective-correct array
4995 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4996 __m128 mipedgescale, mipdensity;
4997 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4998 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4999 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5000 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5001 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5002 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5003 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5004 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5005 attribedge1 = _mm_sub_ss(w0, w1);
5006 attribedge2 = _mm_sub_ss(w2, w1);
5007 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5008 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5009 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5010 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5011 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5012 _mm_store_ss(&triangle->w[0], attribxslope);
5013 _mm_store_ss(&triangle->w[1], attribyslope);
5014 _mm_store_ss(&triangle->w[2], attriborigin);
5015 mipedgescale = _mm_setzero_ps();
5016 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5018 __m128 attrib0, attrib1, attrib2;
5019 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5020 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5022 arrays += numvertices*4;
5023 GENATTRIBS(attrib0, attrib1, attrib2);
5024 attriborigin = _mm_mul_ps(attrib1, w1);
5025 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5026 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5027 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5028 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5029 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5030 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5031 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5032 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5033 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5035 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5036 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5037 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5038 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5042 memset(triangle->mip, 0, sizeof(triangle->mip));
5043 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5045 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5046 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5048 texture = thread->texbound[texunit];
5049 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5051 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5052 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5053 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5054 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5055 // this will be multiplied in the texturing routine by the texture resolution
5056 y = _mm_cvtss_si32(mipdensity);
5059 y = (int)(log((float)y)*0.5f/M_LN2);
5060 if (y > texture->mipmaps - 1)
5061 y = texture->mipmaps - 1;
5062 triangle->mip[texunit] = y;
// scanline walk: the first pass is limited to the end of band 1 (maxy1); the loop's
// update step then moves y up to at least miny2 and limits the next pass to maxy2
5068 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5071 __m128 xcoords, xslope;
5072 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5073 int yccmask = _mm_movemask_epi8(ycc);
5074 int edge0p, edge0n, edge1p, edge1n;
// yccmask carries one hex digit per 32-bit lane of ycc, F where y > that lane of screeny;
// each case names the two active edges as (edge0p -> edge0n) and (edge1p -> edge1n)
5081 case 0xFFFF: /*0000*/ y = endy; continue;
5082 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5083 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5084 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5085 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5086 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5087 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5088 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5089 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5090 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5091 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5092 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5093 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5094 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5095 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5096 case 0x0000: /*1111*/ y++; continue;
5104 case 0xFFFF: /*000*/ y = endy; continue;
5105 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5106 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5107 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5108 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5109 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5110 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5111 case 0x0000: /*111*/ y++; continue;
5114 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5115 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5116 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5117 nexty = _mm_extract_epi16(ycc, 0);
5118 if (nexty >= bandy) nexty = bandy-1;
5119 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5120 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5121 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5122 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5123 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5124 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5126 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5127 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5129 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5131 int startx, endx, offset;
5132 startx = _mm_cvtss_si32(xcoords);
5133 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5136 if (startx < 0) startx = 0;
5137 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5139 if (endx > maxx) endx = maxx;
5140 if (startx >= endx) continue;
// cut the scanline into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// span->startx / span->endx are stored relative to offset
5141 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5143 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5144 span->triangle = thread->numtriangles;
5147 span->startx = max(minx - offset, 0);
5148 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5149 if (span->startx >= span->endx)
5151 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5152 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle buffer full: flush the queued spans and restart triangle numbering
5157 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5159 DPSOFTRAST_Draw_ProcessSpans(thread);
5160 thread->numtriangles = 0;
// release this thread's reference; whoever reaches zero frees vertex data that was
// allocated separately from the command (command no larger than the bare header)
5164 if (!ATOMIC_DECREMENT(command->refcount))
5166 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5167 MM_FREE(command->arrays);
5170 if (thread->numspans > 0 || thread->numtriangles > 0)
5172 DPSOFTRAST_Draw_ProcessSpans(thread);
5173 thread->numtriangles = 0;
// Allocates a Draw command plus the storage its vertex data needs.
// The data area holds, in order: screen coordinates, the position array, one float[4]
// array per entry of the current shader mode's array list, and a copy of the element
// indices. When header + data exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data is placed in
// a separate MM_CALLOC buffer; otherwise it follows the command header in the pool.
// Also points dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] into the data area.
// NOTE(review): the MM_CALLOC result is used without a visible NULL check - confirm.
5178 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5182 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] arrays up front: screen coordinates and positions
5183 int datasize = 2*numvertices*sizeof(float[4]);
5184 DPSOFTRAST_Command_Draw *command;
5185 unsigned char *data;
5186 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5188 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5189 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5191 datasize += numvertices*sizeof(float[4]);
5194 datasize += numtriangles*sizeof(unsigned short[3]);
5196 datasize += numtriangles*sizeof(int[3]);
5197 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for a single pool command: keep only the header in the pool
5198 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5200 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5201 data = (unsigned char *)MM_CALLOC(datasize, 1);
5205 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5206 data = (unsigned char *)command + commandsize;
5208 command->firstvertex = firstvertex;
5209 command->numvertices = numvertices;
5210 command->numtriangles = numtriangles;
5211 command->arrays = (float *)data;
5212 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5213 dpsoftrast.firstvertex = firstvertex;
5214 dpsoftrast.numvertices = numvertices;
// carve the data area into consecutive numvertices * float[4] slices
5215 dpsoftrast.screencoord4f = (float *)data;
5216 data += numvertices*sizeof(float[4]);
5217 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5218 data += numvertices*sizeof(float[4]);
5219 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5221 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5222 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5224 dpsoftrast.post_array4f[j] = (float *)data;
5225 data += numvertices*sizeof(float[4]);
// the element indices are copied into the data area after the vertex arrays
5227 command->element3i = NULL;
5228 command->element3s = NULL;
5231 command->element3s = (unsigned short *)data;
5232 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5236 command->element3i = (int *)data;
5237 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queues a triangle draw: allocates the Draw command, runs the current shader mode's
// Vertex callback, and records the affected scanline range clamped to the framebuffer
// height. An empty range undoes the command (freeing separately allocated vertex data).
// Otherwise the command's refcount is set to the thread count and, when threading is in
// use, starving worker threads whose bands overlap the range are signalled.
5242 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5244 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5245 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5246 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5247 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing visible on any scanline: take the command back out of the pool
5248 if (command->starty >= command->endy)
5250 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5251 MM_FREE(command->arrays);
5252 DPSOFTRAST_UndoCommand(command->commandsize);
5255 command->clipped = dpsoftrast.drawclipped;
// every thread interprets the command and drops one reference when done with it
5256 command->refcount = dpsoftrast.numthreads;
5258 if (dpsoftrast.usethreads)
5261 DPSOFTRAST_Draw_SyncCommands();
5262 for (i = 0; i < dpsoftrast.numthreads; i++)
5264 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5265 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5266 Thread_CondSignal(thread->drawcond);
// presumably the non-threaded branch: execute the queued commands immediately
5271 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command record (opcode 23) carrying the new framebuffer width and height.
// NOTE(review): DEFCOMMAND is defined elsewhere; presumably it generates
// DPSOFTRAST_Command_SetRenderTargets and DPSOFTRAST_OPCODE_SetRenderTargets - confirm.
5275 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler for SetRenderTargets: flags the thread's framebuffer-dependent
// state for revalidation. The command's width/height are not read in the visible code.
5276 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5278 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Installs a new set of render targets: framebuffer dimensions, the depth buffer and up
// to four color buffers. Stores them in dpsoftrast, recalculates the viewport transform
// and queues a SetRenderTargets command carrying the new width and height.
5280 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5282 DPSOFTRAST_Command_SetRenderTargets *command;
// detects whether any dimension or buffer pointer differs from the current targets
// NOTE(review): presumably this guards a flush of pending rendering - confirm
5283 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5284 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5285 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5287 dpsoftrast.fb_width = width;
5288 dpsoftrast.fb_height = height;
5289 dpsoftrast.fb_depthpixels = depthpixels;
5290 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5291 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5292 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5293 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5294 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5295 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5296 command->width = width;
5297 command->height = height;
// Executes queued commands for one thread, starting at thread->commandoffset and stopping
// when the offset reaches endoffset. Each opcode is dispatched to its
// DPSOFTRAST_Interpret_* handler. Fixed-size commands advance by their aligned struct
// size; Draw commands advance by command->commandsize. The offset wraps to 0 at
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL and is written back to thread->commandoffset.
5300 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5302 int commandoffset = thread->commandoffset;
5303 while (commandoffset != endoffset)
5305 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5306 switch (command->opcode)
// INTERPCOMMAND(name): case label + handler call + advance/wrap for a fixed-size command
5308 #define INTERPCOMMAND(name) \
5309 case DPSOFTRAST_OPCODE_##name : \
5310 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5311 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5312 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5313 commandoffset = 0; \
5315 INTERPCOMMAND(Viewport)
5316 INTERPCOMMAND(ClearColor)
5317 INTERPCOMMAND(ClearDepth)
5318 INTERPCOMMAND(ColorMask)
5319 INTERPCOMMAND(DepthTest)
5320 INTERPCOMMAND(ScissorTest)
5321 INTERPCOMMAND(Scissor)
5322 INTERPCOMMAND(BlendFunc)
5323 INTERPCOMMAND(BlendSubtract)
5324 INTERPCOMMAND(DepthMask)
5325 INTERPCOMMAND(DepthFunc)
5326 INTERPCOMMAND(DepthRange)
5327 INTERPCOMMAND(PolygonOffset)
5328 INTERPCOMMAND(CullFace)
5329 INTERPCOMMAND(AlphaTest)
5330 INTERPCOMMAND(AlphaFunc)
5331 INTERPCOMMAND(SetTexture)
5332 INTERPCOMMAND(SetShader)
5333 INTERPCOMMAND(Uniform4f)
5334 INTERPCOMMAND(UniformMatrix4f)
5335 INTERPCOMMAND(Uniform1i)
5336 INTERPCOMMAND(SetRenderTargets)
// Draw is variable-sized, so it advances by the size recorded in the command itself and
// publishes the new offset to thread->commandoffset right away
5338 case DPSOFTRAST_OPCODE_Draw:
5339 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5340 commandoffset += command->commandsize;
5341 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5343 thread->commandoffset = commandoffset;
5346 case DPSOFTRAST_OPCODE_Reset:
5351 thread->commandoffset = commandoffset;
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
// Loops while thread->index >= 0: interprets commands whenever its offset lags behind
// dpsoftrast.drawcommand, otherwise sleeps on drawcond under drawmutex with the starving
// flag set so producers know to signal it.
5354 static int DPSOFTRAST_Draw_Thread(void *data)
5356 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5357 while(thread->index >= 0)
5359 if (thread->commandoffset != dpsoftrast.drawcommand)
5361 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5365 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex so a command queued since the first test is not slept through
5366 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// caught up: release a flusher that is blocked on waitcond for this thread
5368 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5369 thread->starving = true;
5370 Thread_CondWait(thread->drawcond, thread->drawmutex);
5371 thread->starving = false;
5373 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread up to dpsoftrast.drawcommand and then marks the command pool as
// empty (usedcommands = 0). With threading enabled it wakes starving workers that still
// have pending commands and blocks on each worker's waitcond until it has caught up.
// The final loop interprets the pending commands directly on the calling thread.
5379 static void DPSOFTRAST_Draw_FlushThreads(void)
5381 DPSOFTRAST_State_Thread *thread;
5383 DPSOFTRAST_Draw_SyncCommands();
5384 if (dpsoftrast.usethreads)
// pass 1: wake any sleeping worker that has not consumed everything yet
5386 for (i = 0; i < dpsoftrast.numthreads; i++)
5388 thread = &dpsoftrast.threads[i];
5389 if (thread->commandoffset != dpsoftrast.drawcommand)
5391 Thread_LockMutex(thread->drawmutex);
5392 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5393 Thread_CondSignal(thread->drawcond);
5394 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each worker in turn; the worker signals waitcond once it is caught up
// NOTE(review): the wait is guarded by `if`, not a loop - if Thread_CondWait can wake
// spuriously this may return before the worker has finished; confirm
5397 for (i = 0; i < dpsoftrast.numthreads; i++)
5399 thread = &dpsoftrast.threads[i];
5400 if (thread->commandoffset != dpsoftrast.drawcommand)
5402 Thread_LockMutex(thread->drawmutex);
5403 if (thread->commandoffset != dpsoftrast.drawcommand)
5405 thread->waiting = true;
5406 Thread_CondWait(thread->waitcond, thread->drawmutex);
5407 thread->waiting = false;
5409 Thread_UnlockMutex(thread->drawmutex);
// presumably the non-threaded branch: run the pending commands inline
5415 for (i = 0; i < dpsoftrast.numthreads; i++)
5417 thread = &dpsoftrast.threads[i];
5418 if (thread->commandoffset != dpsoftrast.drawcommand)
5419 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5422 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads, which brings every
// thread up to the current end of the command queue.
5425 void DPSOFTRAST_Flush(void)
5427 DPSOFTRAST_Draw_FlushThreads();
// Public "finish" entry point.
// NOTE(review): presumably blocks until all queued rendering has completed (likely by
// flushing) - confirm against the function body.
5430 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and its worker threads.
//   width, height - framebuffer dimensions; also the initial viewport and scissor size
//   numthreads    - requested worker count; threading is used only when this is > 0 and
//                   Thread_HasThreads() reports support, and the count is then bounded
//                   to 1..64 (a single thread state is used otherwise)
//   interlace     - bounded to 0..1 when threading is used, forced to 0 otherwise
//   colorpixels   - becomes color buffer 0; color buffers 1..3 start out NULL
//   depthpixels   - becomes the depth buffer
// Each thread state gets the default render state set below and, when threading is used,
// its own condition variables, mutex and a thread running DPSOFTRAST_Draw_Thread.
5435 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5445 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): u is presumably a union used as a byte-order probe - confirm
5446 dpsoftrast.bigendian = u.b[3];
5447 dpsoftrast.fb_width = width;
5448 dpsoftrast.fb_height = height;
5449 dpsoftrast.fb_depthpixels = depthpixels;
5450 dpsoftrast.fb_colorpixels[0] = colorpixels;
// only color buffer 0 is supplied; clear each of the remaining slots explicitly
// (previously slot 1 was assigned three times and slots 2 and 3 were never named)
5451 dpsoftrast.fb_colorpixels[1] = NULL;
5452 dpsoftrast.fb_colorpixels[2] = NULL;
5453 dpsoftrast.fb_colorpixels[3] = NULL;
5454 dpsoftrast.viewport[0] = 0;
5455 dpsoftrast.viewport[1] = 0;
5456 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5457 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5458 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5459 dpsoftrast.texture_firstfree = 1;
5460 dpsoftrast.texture_end = 1;
5461 dpsoftrast.texture_max = 0;
5462 dpsoftrast.color[0] = 1;
5463 dpsoftrast.color[1] = 1;
5464 dpsoftrast.color[2] = 1;
5465 dpsoftrast.color[3] = 1;
5466 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5467 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5468 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5469 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5470 for (i = 0; i < dpsoftrast.numthreads; i++)
5472 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default per-thread render state
5474 thread->cullface = GL_BACK;
5475 thread->colormask[1] = 1;
5476 thread->colormask[2] = 1;
5477 thread->colormask[3] = 1;
5478 thread->blendfunc[0] = GL_ONE;
5479 thread->blendfunc[1] = GL_ZERO;
5480 thread->depthmask = true;
5481 thread->depthtest = true;
5482 thread->depthfunc = GL_LEQUAL;
5483 thread->scissortest = false;
5484 thread->alphatest = false;
5485 thread->alphafunc = GL_GREATER;
5486 thread->alphavalue = 0.5f;
5487 thread->viewport[0] = 0;
5488 thread->viewport[1] = 0;
5489 thread->viewport[2] = dpsoftrast.fb_width;
5490 thread->viewport[3] = dpsoftrast.fb_height;
5491 thread->scissor[0] = 0;
5492 thread->scissor[1] = 0;
5493 thread->scissor[2] = dpsoftrast.fb_width;
5494 thread->scissor[3] = dpsoftrast.fb_height;
5495 thread->depthrange[0] = 0;
5496 thread->depthrange[1] = 1;
5497 thread->polygonoffset[0] = 0;
5498 thread->polygonoffset[1] = 0;
5500 DPSOFTRAST_RecalcThread(thread);
5502 thread->numspans = 0;
5503 thread->numtriangles = 0;
5504 thread->commandoffset = 0;
5505 thread->waiting = false;
5506 thread->starving = false;
// force a full validation pass so all derived state is computed before first use
5508 thread->validate = -1;
5509 DPSOFTRAST_Validate(thread, -1);
5511 if (dpsoftrast.usethreads)
5513 thread->waitcond = Thread_CreateCond();
5514 thread->drawcond = Thread_CreateCond();
5515 thread->drawmutex = Thread_CreateMutex();
5516 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5522 void DPSOFTRAST_Shutdown(void)
5525 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5527 DPSOFTRAST_State_Thread *thread;
5528 for (i = 0; i < dpsoftrast.numthreads; i++)
5530 thread = &dpsoftrast.threads[i];
5531 Thread_LockMutex(thread->drawmutex);
5533 Thread_CondSignal(thread->drawcond);
5534 Thread_UnlockMutex(thread->drawmutex);
5535 Thread_WaitThread(thread->thread, 0);
5536 Thread_DestroyCond(thread->waitcond);
5537 Thread_DestroyCond(thread->drawcond);
5538 Thread_DestroyMutex(thread->drawmutex);
5541 for (i = 0;i < dpsoftrast.texture_end;i++)
5542 if (dpsoftrast.texture[i].bytes)
5543 MM_FREE(dpsoftrast.texture[i].bytes);
5544 if (dpsoftrast.texture)
5545 free(dpsoftrast.texture);
5546 if (dpsoftrast.threads)
5547 MM_FREE(dpsoftrast.threads);
5548 memset(&dpsoftrast, 0, sizeof(dpsoftrast));