3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Alignment (in bytes) handed to _mm_malloc by MM_MALLOC/MM_CALLOC.
18 #define ATOMIC_SIZE 32
// Per-toolchain definitions of:
//   ALIGN(var)     - declare var with 16-byte alignment
//   ATOMIC(var)    - declare var with 32-byte alignment
//   MEMORY_BARRIER - _mm_sfence()
//   ATOMIC_COUNTER - counter type used with the ATOMIC_* operations
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD - atomic updates of a counter
// Apple: OSAtomic*32Barrier functions from <libkern/OSAtomic.h>.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: __sync_* builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) and the Interlocked* functions.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment, no barrier, plain non-atomic arithmetic.
// NOTE(review): the #ifndef/#endif lines guarding these fallbacks are only
// partly visible in this listing - confirm the conditional structure.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
74 #include <emmintrin.h>
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Releases memory obtained from the _mm_malloc-based MM_MALLOC/MM_CALLOC.
85 #define MM_FREE _mm_free
// Alternative definitions that map straight onto the C allocator.
// NOTE(review): the preprocessor conditional that selects between the two
// sets of definitions is not visible in this listing.
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets); DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// to size attribs[] and post_array4f[].
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One entry of the dpsoftrast.texture array.
// NOTE(review): several members (flags, width, height, ...) are assigned in
// DPSOFTRAST_Texture_New but their declarations are not visible in this listing.
108 typedef struct DPSOFTRAST_Texture_s
// filter mode, stored by DPSOFTRAST_Texture_Filter
115 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably the number of outstanding binds of this texture - confirm against its users
118 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks an unused slot
119 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Granularity that DPSOFTRAST_ALIGNCOMMAND rounds command sizes up to.
// NOTE(review): ALIGN_SIZE is not defined in the visible part of the file.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Header common to every queued command: the opcode and the size recorded
// for the command by DPSOFTRAST_AllocateCommand.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode;
130 unsigned short commandsize;
// Opcode that DPSOFTRAST_AllocateCommand writes at the current position when
// the space left before the end of the pool is too small for the new command.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares, for one command type:
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the struct DPSOFTRAST_Command_<name>, which starts with the same
//     opcode/commandsize header as DPSOFTRAST_Command
// NOTE(review): the line that expands `fields` into the struct body is not
// visible in this listing.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool's storage.
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands.
// NOTE(review): the freecommand/usedcommands members used by the allocator
// functions are not visible in this listing.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
// raw command bytes; DPSOFTRAST_AllocateCommand places commands at offset freecommand
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle data consumed by the DPSOFTRAST_CALCATTRIB macros.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// for each attribute array, three float4 rows: [0] is multiplied by span x,
// [1] by span y, [2] is the constant term (see DPSOFTRAST_CALCATTRIB)
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// SSE evaluation of one attribute at a span's position:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data are __m128 lvalues; the aligned loads (_mm_load_ps) require
// 16-byte aligned attribs rows.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar evaluation of one four-component attribute at a span's position.
//   slope[k] = attribs[arrayindex][0][k]
//   data[k]  = attribs[arrayindex][2][k] + span->x * slope[k] + span->y * attribs[arrayindex][1][k]
// triangle and span are pointers; data and slope are float[4] lvalues.
// Every pointer/array argument is parenthesized so that expression arguments
// such as `spans + i` expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels generated from a triangle; x and y are the
// coordinates the DPSOFTRAST_CALCATTRIB macros evaluate attributes at.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
192 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bits of thread->validate: each marks one group of derived values as stale.
// DPSOFTRAST_Validate recalculates the flagged groups and clears the bits.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups together
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer distinguishes. DPSOFTRAST_RecalcBlendFunc maps
// the GL source/destination factor pair (and the subtract flag) onto one of
// these and stores it in thread->fb_blendmode; unrecognized pairs map to OPAQUE.
202 typedef enum DPSOFTRAST_BLENDMODE_e
204 DPSOFTRAST_BLENDMODE_OPAQUE,
205 DPSOFTRAST_BLENDMODE_ALPHA,
206 DPSOFTRAST_BLENDMODE_ADDALPHA,
207 DPSOFTRAST_BLENDMODE_ADD,
208 DPSOFTRAST_BLENDMODE_INVMOD,
209 DPSOFTRAST_BLENDMODE_MUL,
210 DPSOFTRAST_BLENDMODE_MUL2,
211 DPSOFTRAST_BLENDMODE_SUBALPHA,
212 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
213 DPSOFTRAST_BLENDMODE_INVADD,
214 DPSOFTRAST_BLENDMODE_TOTAL
216 DPSOFTRAST_BLENDMODE;
// State held per rasterizer thread; the DPSOFTRAST_Interpret_* functions
// write it as they process queued commands.
// NOTE(review): many members referenced elsewhere in the file (viewport,
// scissor, validate, miny1, drawmutex, ...) are not visible in this listing.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// [0] = alongnormal, [1] = intoview, as set by the PolygonOffset command
237 float polygonoffset[2];
240 int shader_permutation;
241 int shader_exactspecularmath;
// textures bound per texture unit; entries point into dpsoftrast.texture and
// are rebased by DPSOFTRAST_Texture_Grow when that array moves
243 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
245 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
248 // DPSOFTRAST_VALIDATE_ flags
251 // derived values (DPSOFTRAST_VALIDATE_FB)
// written by DPSOFTRAST_RecalcViewport
254 ALIGN(float fb_viewportcenter[4]);
255 ALIGN(float fb_viewportscale[4]);
257 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
260 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool compares it
// with freecommand and drawcommand to measure this thread's progress
269 ATOMIC(volatile int commandoffset);
// flags used with drawmutex/drawcond/waitcond in DPSOFTRAST_Draw_FreeCommandPool
271 volatile bool waiting;
272 volatile bool starving;
279 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
280 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
282 DPSOFTRAST_State_Thread);
// Global rasterizer state, instantiated once as `dpsoftrast` below.
// NOTE(review): many members referenced elsewhere in the file (fb_width,
// fb_height, viewport, numthreads, texture_max, texture_end, ...) are not
// visible in this listing.
284 typedef ATOMIC(struct DPSOFTRAST_State_s
// framebuffer storage: one depth buffer and up to four color buffers,
// addressed as y * fb_width + x by the clear commands
288 unsigned int *fb_depthpixels;
289 unsigned int *fb_colorpixels[4];
// global copy of the viewport transform, updated by DPSOFTRAST_Viewport
292 ALIGN(float fb_viewportcenter[4]);
293 ALIGN(float fb_viewportscale[4]);
296 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
297 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// vertex array pointers
299 const float *pointer_vertex3f;
300 const float *pointer_color4f;
301 const unsigned char *pointer_color4ub;
302 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
305 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
306 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// currently bound textures; rebased by DPSOFTRAST_Texture_Grow
307 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
311 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
312 float *screencoord4f;
318 int shader_permutation;
319 int shader_exactspecularmath;
// index at which DPSOFTRAST_Texture_New starts searching for a free slot
323 int texture_firstfree;
// texture table, grown with realloc by DPSOFTRAST_Texture_Grow
324 DPSOFTRAST_Texture *texture;
// message describing the most recent error reported by an API call
329 const char *errorstring;
// array of numthreads per-thread states
334 DPSOFTRAST_State_Thread *threads;
// copy of commandpool.freecommand published by DPSOFTRAST_Draw_SyncCommands
336 ATOMIC(volatile int drawcommand);
338 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
342 DPSOFTRAST_State dpsoftrast;
// Scale used by DPSOFTRAST_DEPTH32_FROM_DEPTH32F (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit pixel laid out as
// alpha<<24 | red<<16 | green<<8 | blue; each channel is scaled by 255 and
// rounded by adding 0.5 before truncation.
// Arguments are parenthesized so expression arguments scale as a whole, and
// the alpha byte is shifted as unsigned because a value >= 128 shifted left
// by 24 does not fit a signed int. The result type remains int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth encoding DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
350 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
352 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
353 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
354 fb_viewportcenter[3] = 0.5f;
355 fb_viewportcenter[0] = 0.0f;
356 fb_viewportscale[1] = 0.5f * viewport[2];
357 fb_viewportscale[2] = -0.5f * viewport[3];
358 fb_viewportscale[3] = 0.5f;
359 fb_viewportscale[0] = 1.0f;
// Assigns the thread its scanline ranges [miny1, maxy1) and [miny2, maxy2),
// dividing fb_height evenly among numthreads by thread->index.
// NOTE(review): the braces and the `else` separating the two variants are
// not visible in this listing.
362 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the framebuffer is cut into 2*numthreads bands and the thread
// gets band `index` and band `numthreads + index`
364 if (dpsoftrast.interlace)
366 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
367 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// otherwise a single band; both ranges are set to the same values
373 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
374 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Recomputes the thread's framebuffer-space scissor rectangle, its viewport
// transform and its scanline ranges from the current scissor/viewport state.
// NOTE(review): the declarations of x1/y1/x2/y2 and any lower-bound clamps
// are not visible in this listing.
378 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
380 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
381 // and viewport projection values
// scissor rectangle with its y axis flipped against the framebuffer height
384 x1 = thread->scissor[0];
385 x2 = thread->scissor[0] + thread->scissor[2];
386 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
387 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the whole framebuffer is used
388 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
390 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
392 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as x, y, width, height
393 thread->fb_scissor[0] = x1;
394 thread->fb_scissor[1] = y1;
395 thread->fb_scissor[2] = x2 - x1;
396 thread->fb_scissor[3] = y2 - y1;
398 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
399 DPSOFTRAST_RecalcThread(thread);
402 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
404 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the thread's GL blend factors onto a DPSOFTRAST_BLENDMODE value in
// thread->fb_blendmode. The source and destination factors are combined into
// one switch key as (sfactor<<16)|dfactor; unrecognized pairs select OPAQUE.
407 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only SRC_ALPHA/ONE is recognized
409 if (thread->blendsubtract)
411 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label that assigns the given blend mode
413 #define BLENDFUNC(sfactor, dfactor, blendmode) \
414 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
415 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
416 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// second switch for the remaining factor pairs
// NOTE(review): the `else` introducing this switch is not visible in this listing
421 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
423 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
424 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
425 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
426 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
427 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs map to MUL
428 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
429 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
430 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
431 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
432 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
433 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one flag in f is
// pending in thread->validate; the expression always evaluates to 0.
// Both arguments are parenthesized so that expression arguments such as
// `threads + i` expand correctly.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recalculates the derived-state groups selected by mask that are currently
// flagged in thread->validate. Each flag is cleared before its Recalc
// function runs.
440 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only consider groups that are both requested and pending
442 mask &= thread->validate;
445 if (mask & DPSOFTRAST_VALIDATE_FB)
447 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
448 DPSOFTRAST_RecalcFB(thread);
450 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
452 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
453 DPSOFTRAST_RecalcDepthFunc(thread);
455 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
457 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
458 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by its index. Returns the slot's address when index is
// in [1, texture_end) and the slot has pixel storage.
// NOTE(review): the return for the not-found case is not visible in this
// listing; callers test the result with `if (!texture)`.
462 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
464 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
465 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and rebases every bound-texture
// pointer (global and per thread) onto the reallocated array.
// NOTE(review): the declarations of i and j, and the `else` between the two
// sizing statements, are not visible in this listing.
// NOTE(review): the realloc result is stored directly with no visible NULL
// check; on failure the old array pointer would be lost and the rebasing
// below would compute pointers from NULL.
469 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound pointers can be translated by their offset
471 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
472 DPSOFTRAST_State_Thread *thread;
476 // expand texture array as needed
477 if (dpsoftrast.texture_max < 1024)
478 dpsoftrast.texture_max = 1024;
480 dpsoftrast.texture_max *= 2;
481 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings
482 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
483 if (dpsoftrast.texbound[i])
484 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase each thread's bindings
485 for (j = 0; j < dpsoftrast.numthreads; j++)
487 thread = &dpsoftrast.threads[j];
488 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
489 if (thread->texbound[i])
490 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture of the given dimensions and flags: validates the
// arguments, picks a free slot in dpsoftrast.texture, fills in the mipmap
// table and allocates zeroed pixel storage for all levels.
// On a validation failure dpsoftrast.errorstring is set to a message.
// NOTE(review): local declarations, the `return` statements, the switch
// header and the mipmap loop header are not visible in this listing, so the
// exact return values cannot be confirmed from here.
494 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces
503 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
504 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
505 DPSOFTRAST_Texture *texture;
// NOTE(review): the product can overflow int, and two negative dimensions
// give a positive product; such values are still rejected later by the
// power-of-two test
506 if (width*height*depth < 1)
508 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
511 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
513 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// per-format restrictions; the color formats have none visible here
518 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
519 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
520 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
// depth format: no cubemaps, no mipmaps
522 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
523 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the condition guarding this second, identical message is not
// visible in this listing
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
533 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
535 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// 3D textures (depth != 1) can be neither cubemaps nor mipmapped
540 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
542 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
545 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
547 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the previous check verbatim
550 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
552 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
555 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
557 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is zero only for powers of two (and zero)
560 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
562 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
565 // find first empty slot in texture array
566 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
567 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot just taken
569 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the chosen slot lies beyond its capacity
570 if (dpsoftrast.texture_max <= texnum)
571 DPSOFTRAST_Texture_Grow();
572 if (dpsoftrast.texture_end <= texnum)
573 dpsoftrast.texture_end = texnum + 1;
574 texture = &dpsoftrast.texture[texnum];
575 memset(texture, 0, sizeof(*texture));
576 texture->flags = flags;
577 texture->width = width;
578 texture->height = height;
579 texture->depth = depth;
580 texture->sides = sides;
// mipmap table entry for the current level: 4 bytes per pixel per side
592 s = w * h * d * sides * 4;
// [0] offset of the level (bytes accumulated so far), [1] size of the level
593 texture->mipmap[mipmaps][0] = size;
594 texture->mipmap[mipmaps][1] = s;
595 texture->mipmap[mipmaps][2] = w;
596 texture->mipmap[mipmaps][3] = h;
597 texture->mipmap[mipmaps][4] = d;
// stop at the 1x1x1 level, or after the first level without the MIPMAP flag
600 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
606 texture->mipmaps = mipmaps;
607 texture->size = size;
609 // allocate the pixels now
// NOTE(review): no NULL check on the allocation is visible in this listing
610 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// free-slot hint and the used range. Does nothing for an invalid index.
// NOTE(review): some lines between the lookup and MM_FREE are not visible in
// this listing.
614 void DPSOFTRAST_Texture_Free(int index)
616 DPSOFTRAST_Texture *texture;
617 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
621 MM_FREE(texture->bytes);
622 texture->bytes = NULL;
// zeroing the slot also leaves bytes NULL, which marks the slot unused
623 memset(texture, 0, sizeof(*texture));
624 // adjust the free range and used range
625 if (dpsoftrast.texture_firstfree > index)
626 dpsoftrast.texture_firstfree = index;
// trim unused slots off the end of the used range
627 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
628 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture, each from the level
// above it, by averaging 4-byte pixels with rounding. Does nothing for an
// invalid index.
// NOTE(review): the assignments of layer0/layer1/row0/row1, the early
// return and the conditions selecting the 3D and 2D paths are not visible in
// this listing.
630 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
632 int i, x, y, z, w, layer0, layer1, row0, row1;
633 unsigned char *o, *i0, *i1, *i2, *i3;
634 DPSOFTRAST_Texture *texture;
635 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636 if (texture->mipmaps <= 1)
638 for (i = 1;i < texture->mipmaps;i++)
640 for (z = 0;z < texture->mipmap[i][4];z++)
// keep the second source layer inside the parent level
644 if (layer1 >= texture->mipmap[i-1][4])
645 layer1 = texture->mipmap[i-1][4]-1;
646 for (y = 0;y < texture->mipmap[i][3];y++)
// keep the second source row inside the parent level
650 if (row1 >= texture->mipmap[i-1][3])
651 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the source rows in level i-1 for the
// combinations of (layer0|layer1) x (row0|row1)
652 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
653 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
654 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
655 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
656 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
657 w = texture->mipmap[i][2];
660 if (texture->mipmap[i-1][2] > 1)
662 // average 3D texture
// eight source pixels per output pixel, +4 rounds before the >>3
663 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
665 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
666 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
667 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
668 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
673 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are not advanced by this loop
674 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
676 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
677 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
678 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
679 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
685 if (texture->mipmap[i-1][2] > 1)
687 // average 2D texture (common case)
// four source pixels per output pixel, +2 rounds before the >>2
688 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
690 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
691 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
692 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
693 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
698 // 2D texture with parent width == 1
699 o[0] = (i0[0] + i1[0] + 1) >> 1;
700 o[1] = (i0[1] + i1[1] + 1) >> 1;
701 o[2] = (i0[2] + i1[2] + 1) >> 1;
702 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then
// regenerates the mipmaps. Does nothing for an invalid index.
// NOTE(review): the visible lines never use `mip`; the destination is always
// addressed with the level 0 width. The declaration of dst, the loop's
// decrement of blockheight and any guard on `pixels` are not visible in this
// listing. No bounds check of the block against the texture size is visible.
709 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
711 DPSOFTRAST_Texture *texture;
713 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel of the block
718 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
719 while (blockheight > 0)
// one row per iteration: the source advances by the block width, the
// destination by the full texture width
721 memcpy(dst, pixels, blockwidth * 4);
722 pixels += blockwidth * 4;
723 dst += texture->mipmap[0][2] * 4;
727 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with `pixels` (mipmap[0][1] bytes are
// copied) and regenerates the mipmaps. Does nothing for an invalid index.
// NOTE(review): some lines between the lookup and the copy are not visible
// in this listing.
729 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
731 DPSOFTRAST_Texture *texture;
732 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
736 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
737 DPSOFTRAST_Texture_CalculateMipmaps(index);
739 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
741 DPSOFTRAST_Texture *texture;
742 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
743 return texture->mipmap[mip][2];
745 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
747 DPSOFTRAST_Texture *texture;
748 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
749 return texture->mipmap[mip][3];
751 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
753 DPSOFTRAST_Texture *texture;
754 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755 return texture->mipmap[mip][4];
// Returns a pointer to the first pixel of mip level `mip` of texture
// `index`, or 0 for an invalid index.
// NOTE(review): `mip` is used as an array index with no visible bounds
// check; some lines before the return are not visible in this listing.
757 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
759 DPSOFTRAST_Texture *texture;
760 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset into the pixel storage
763 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filters above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported through
// dpsoftrast.errorstring when the texture lacks the MIPMAP flag.
// Does nothing for an invalid index.
// NOTE(review): the return after the error and any lines before the final
// assignment are not visible in this listing.
765 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
767 DPSOFTRAST_Texture *texture;
768 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
771 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
776 texture->filter = filter;
779 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current end of the queued commands: copies
// commandpool.freecommand into dpsoftrast.drawcommand. With threads in use a
// MEMORY_BARRIER is issued first so the command bytes are stored before the
// new offset.
781 static void DPSOFTRAST_Draw_SyncCommands(void)
783 if(dpsoftrast.usethreads) MEMORY_BARRIER;
784 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least `space` bytes of the command pool available by
// recomputing how much of it the threads have not yet consumed and, if that
// is still too much, waiting on the thread selected as waitindex.
// NOTE(review): the declarations of i, waitindex and commandoffset, the
// enclosing loop and its exits, and the assignment of waitindex are not
// visible in this listing.
787 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
789 DPSOFTRAST_State_Thread *thread;
791 int freecommand = dpsoftrast.commandpool.freecommand;
792 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already
793 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// make everything queued so far visible to the threads
795 DPSOFTRAST_Draw_SyncCommands();
// usedcommands becomes the largest distance between any thread's
// commandoffset and freecommand, wrapped to the pool size
801 for (i = 0; i < dpsoftrast.numthreads; i++)
803 thread = &dpsoftrast.threads[i];
804 commandoffset = freecommand - thread->commandoffset;
805 if (commandoffset < 0)
806 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
807 if (commandoffset > usedcommands)
810 usedcommands = commandoffset;
813 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait for the chosen thread unless it has already caught up with drawcommand
815 thread = &dpsoftrast.threads[waitindex];
816 Thread_LockMutex(thread->drawmutex);
817 if (thread->commandoffset != dpsoftrast.drawcommand)
819 thread->waiting = true;
// signal the thread first if it has flagged itself as starving
820 if (thread->starving) Thread_CondSignal(thread->drawcond);
821 Thread_CondWait(thread->waitcond, thread->drawmutex);
822 thread->waiting = false;
824 Thread_UnlockMutex(thread->drawmutex);
826 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE; the masking is only
// correct when COMMAND_SIZE is a power of two.
829 #define DPSOFTRAST_ALIGNCOMMAND(size) \
830 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type `name` via DPSOFTRAST_AllocateCommand, using
// the aligned size of its struct, and yields a pointer of that struct type.
831 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
832 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool for a command with the given
// opcode and fills in the command header.
// NOTE(review): the `else` between the two freeing strategies, the updates
// of freecommand (advance and wrap) and the return statement are not visible
// in this listing.
834 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
836 DPSOFTRAST_Command *command;
837 int freecommand = dpsoftrast.commandpool.freecommand;
838 int usedcommands = dpsoftrast.commandpool.usedcommands;
// room for a bare command header is always requested in addition ...
839 int extra = sizeof(DPSOFTRAST_Command);
// ... plus the tail of the pool when the command does not fit before its end
840 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: wait for threads to consume commands, or flush
842 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
844 if (dpsoftrast.usethreads)
845 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
847 DPSOFTRAST_Draw_FlushThreads();
// reload, since the calls above may have changed the pool state
848 freecommand = dpsoftrast.commandpool.freecommand;
849 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit in the tail: write a Reset opcode there and count the
// tail as used
851 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
853 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
854 command->opcode = DPSOFTRAST_OPCODE_Reset;
855 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// header of the new command
858 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
859 command->opcode = opcode;
860 command->commandsize = size;
862 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
864 dpsoftrast.commandpool.freecommand = freecommand;
865 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back `size` bytes of the most recent allocation by adjusting the
// pool's freecommand and usedcommands.
// NOTE(review): the line that moves freecommand back and the condition on
// the wrap-around correction are not visible in this listing.
869 static void DPSOFTRAST_UndoCommand(int size)
871 int freecommand = dpsoftrast.commandpool.freecommand;
872 int usedcommands = dpsoftrast.commandpool.usedcommands;
875 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
876 usedcommands -= size;
877 dpsoftrast.commandpool.freecommand = freecommand;
878 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command: x, y, width and height of the viewport rectangle.
881 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Copies the command's rectangle into thread->viewport and flags the
// framebuffer-derived state (DPSOFTRAST_VALIDATE_FB) for recalculation.
882 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
884 thread->viewport[0] = command->x;
885 thread->viewport[1] = command->y;
886 thread->viewport[2] = command->width;
887 thread->viewport[3] = command->height;
888 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command and mirrors the rectangle into the global state,
// recomputing the global viewport center/scale immediately.
// NOTE(review): the assignments of command->x and command->y are not visible
// in this listing.
890 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
892 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
895 command->width = width;
896 command->height = height;
898 dpsoftrast.viewport[0] = x;
899 dpsoftrast.viewport[1] = y;
900 dpsoftrast.viewport[2] = width;
901 dpsoftrast.viewport[3] = height;
902 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command: the clear color as four floats.
905 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Fills the scissor rectangle of the color buffers with the command's color,
// restricted to the scanline ranges owned by this thread.
// NOTE(review): the declarations of c, p and bandy, the early exits, the
// inner loop over the band and the pixel store are not visible in this
// listing.
906 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
908 int i, x1, y1, x2, y2, w, h, x, y;
909 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the scanline ranges are up to date
913 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
914 miny1 = thread->miny1;
915 maxy1 = thread->maxy1;
916 miny2 = thread->miny2;
917 maxy2 = thread->maxy2;
918 x1 = thread->fb_scissor[0];
919 y1 = thread->fb_scissor[1];
920 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
921 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rows to the span from the thread's first to its last scanline
922 if (y1 < miny1) y1 = miny1;
923 if (y2 > maxy2) y2 = maxy2;
928 // FIXME: honor fb_colormask?
929 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// each of the up to four color buffers
930 for (i = 0;i < 4;i++)
932 if (!dpsoftrast.fb_colorpixels[i])
// two passes: rows up to maxy1, then rows from miny2 up to maxy2
934 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
937 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
938 for (x = x1;x < x2;x++)
// Queues a ClearColor command.
// NOTE(review): the lines storing r, g, b and a into the command are not
// visible in this listing.
943 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
945 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command: the depth value to clear to.
952 DEFCOMMAND(3, ClearDepth, float depth;)
// Fills the scissor rectangle of the depth buffer with the encoded depth
// value, restricted to the scanline ranges owned by this thread.
// NOTE(review): the declarations of c, p and bandy, the inner loop over the
// band and the pixel store are not visible in this listing.
953 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
955 int x1, y1, x2, y2, w, h, x, y;
956 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the scanline ranges are up to date
960 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
961 miny1 = thread->miny1;
962 maxy1 = thread->maxy1;
963 miny2 = thread->miny2;
964 maxy2 = thread->maxy2;
965 x1 = thread->fb_scissor[0];
966 y1 = thread->fb_scissor[1];
967 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
968 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rows to the span from the thread's first to its last scanline
969 if (y1 < miny1) y1 = miny1;
970 if (y2 > maxy2) y2 = maxy2;
975 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two passes: rows up to maxy1, then rows from miny2 up to maxy2
976 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
979 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
980 for (x = x1;x < x2;x++)
// Queues a ClearDepth command.
// NOTE(review): the line storing d into the command is not visible in this
// listing.
984 void DPSOFTRAST_ClearDepth(float d)
986 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command: per-channel write enables (non-zero = enabled).
990 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Stores the four enables as 0/1 in thread->colormask and builds the
// matching pixel bit mask in thread->fb_colormask.
991 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
993 thread->colormask[0] = command->r != 0;
994 thread->colormask[1] = command->g != 0;
995 thread->colormask[2] = command->b != 0;
996 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives all-zero or all-one bits, which selects the
// channel's byte: red 0x00FF0000, green 0x0000FF00, blue 0x000000FF,
// alpha 0xFF000000
997 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command.
// NOTE(review): the lines storing r, g, b and a into the command are not
// visible in this listing.
999 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1001 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command: enable flag for depth testing.
1008 DEFCOMMAND(5, DepthTest, int enable;)
// Stores the flag and marks the derived depth function as stale, since
// DPSOFTRAST_RecalcDepthFunc chooses fb_depthfunc based on depthtest.
1009 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1011 thread->depthtest = command->enable;
1012 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Queues a DepthTest command carrying the enable flag.
1014 void DPSOFTRAST_DepthTest(int enable)
1016 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1017 command->enable = enable;
// ScissorTest command: enable flag for the scissor test.
1020 DEFCOMMAND(6, ScissorTest, int enable;)
// Stores the flag and marks the framebuffer-derived state as stale, since
// DPSOFTRAST_RecalcFB computes fb_scissor differently when the test is off.
1021 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1023 thread->scissortest = command->enable;
1024 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command carrying the enable flag.
1026 void DPSOFTRAST_ScissorTest(int enable)
1028 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1029 command->enable = enable;
// Scissor command: x, y, width and height of the scissor rectangle.
1032 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Copies the command's rectangle into thread->scissor and flags the
// framebuffer-derived state (DPSOFTRAST_VALIDATE_FB) for recalculation.
1033 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1035 thread->scissor[0] = command->x;
1036 thread->scissor[1] = command->y;
1037 thread->scissor[2] = command->width;
1038 thread->scissor[3] = command->height;
1039 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command.
// NOTE(review): the assignments of command->x and command->y are not visible
// in this listing.
1041 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1043 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1046 command->width = width;
1047 command->height = height;
// BlendFunc command: source and destination blend factors.
1050 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Stores the factor pair and marks the derived blend mode as stale.
1051 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1053 thread->blendfunc[0] = command->sfactor;
1054 thread->blendfunc[1] = command->dfactor;
1055 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendFunc command carrying the source and destination factors.
1057 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1059 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1060 command->sfactor = sfactor;
1061 command->dfactor = dfactor;
// BlendSubtract command: enable flag for subtractive blending.
1064 DEFCOMMAND(9, BlendSubtract, int enable;)
// Stores the flag and marks the derived blend mode as stale, since
// DPSOFTRAST_RecalcBlendFunc branches on blendsubtract.
1065 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1067 thread->blendsubtract = command->enable;
1068 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendSubtract command carrying the enable flag.
1070 void DPSOFTRAST_BlendSubtract(int enable)
1072 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1073 command->enable = enable;
// DepthMask command: enable flag stored in thread->depthmask.
1076 DEFCOMMAND(10, DepthMask, int enable;)
// Stores the flag; no derived state is flagged for recalculation.
1077 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1079 thread->depthmask = command->enable;
// Queues a DepthMask command carrying the enable flag.
1081 void DPSOFTRAST_DepthMask(int enable)
1083 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1084 command->enable = enable;
1087 DEFCOMMAND(11, DepthFunc, int func;)
1088 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1090 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command carrying the function code.
1092 void DPSOFTRAST_DepthFunc(int func)
1094 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1095 command->func = func;
// DEFCOMMAND (defined elsewhere) is given the number 12, the name DepthRange
// and the payload fields nearval and farval.
1098 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command to one thread's state: stores near and far in
// thread->depthrange[0] and [1].
1099 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1101 thread->depthrange[0] = command->nearval;
1102 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command carrying the near and far
// values.
1104 void DPSOFTRAST_DepthRange(float nearval, float farval)
1106 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1107 command->nearval = nearval;
1108 command->farval = farval;
// DEFCOMMAND (defined elsewhere) is given the number 13, the name
// PolygonOffset and the payload fields alongnormal and intoview.
1111 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command to one thread's state: stores the two offset
// terms in thread->polygonoffset[0] and [1].
1112 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1114 thread->polygonoffset[0] = command->alongnormal;
1115 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command carrying both offset
// terms.
1117 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1119 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1120 command->alongnormal = alongnormal;
1121 command->intoview = intoview;
// DEFCOMMAND (defined elsewhere) is given the number 14, the name CullFace and
// the payload field mode.
1124 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command to one thread's state: stores the mode in
// thread->cullface.
1125 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1127 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command carrying the cull mode.
1129 void DPSOFTRAST_CullFace(int mode)
1131 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1132 command->mode = mode;
// DEFCOMMAND (defined elsewhere) is given the number 15, the name AlphaTest
// and the payload field enable.
1135 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command to one thread's state: stores the enable flag
// in thread->alphatest.
1136 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1138 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command carrying the enable flag.
1140 void DPSOFTRAST_AlphaTest(int enable)
1142 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1143 command->enable = enable;
// DEFCOMMAND (defined elsewhere) is given the number 16, the name AlphaFunc
// and the payload fields func and ref.
1146 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command to one thread's state: stores the comparison
// function in thread->alphafunc and the reference value in thread->alphavalue.
1147 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1149 thread->alphafunc = command->func;
1150 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and fills it from the
// arguments.
1152 void DPSOFTRAST_AlphaFunc(int func, float ref)
1154 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1155 command->func = func;
// Sets the current constant colour: writes r, g, b, a straight into
// dpsoftrast.color[0..3] (no command is allocated here).
1159 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1161 dpsoftrast.color[0] = r;
1162 dpsoftrast.color[1] = g;
1163 dpsoftrast.color[2] = b;
1164 dpsoftrast.color[3] = a;
// Reads back a rectangle of the colour framebuffer
// (dpsoftrast.fb_colorpixels[0]) into outpixels.
//   blockx, blocky          - origin of the requested block
//   blockwidth, blockheight - size of the requested block
//   outpixels               - destination, 4 bytes per pixel, blockwidth * 4
//                             bytes per row
1167 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1169 int outstride = blockwidth * 4;
1170 int instride = dpsoftrast.fb_width * 4;
1173 int bx2 = blockx + blockwidth;
1174 int by2 = blocky + blockheight;
1178 unsigned char *inpixels;
// clamp the block to the framebuffer bounds
1182 if (bx1 < 0) bx1 = 0;
1183 if (by1 < 0) by1 = 0;
1184 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1185 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1187 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// two row loops follow; the first is taken when dpsoftrast.bigendian is set
1188 if (dpsoftrast.bigendian)
1190 for (y = by1;y < by2;y++)
// source rows are addressed from the bottom of the framebuffer
// (fb_height - 1 - y), so row order is flipped relative to memory order
1192 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1193 o = (unsigned char *)outpixels + (y - by1) * outstride;
1194 for (x = bx1;x < bx2;x++)
1207 for (y = by1;y < by2;y++)
1209 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1210 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the colour framebuffer
// (dpsoftrast.fb_colorpixels[0]) into one mip level of a texture.
//   index, mip - texture handle and mip level to write
//   tx, ty     - destination origin inside the mip level
//   sx, sy     - source origin inside the framebuffer
// Returns without copying when the handle is invalid or mip is out of range.
1216 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1220 int tx2 = tx + width;
1221 int ty2 = ty + height;
1224 int sx2 = sx + width;
1225 int sy2 = sy + height;
1235 unsigned int *spixels;
1236 unsigned int *tpixels;
1237 DPSOFTRAST_Texture *texture;
1238 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1239 if (mip < 0 || mip >= texture->mipmaps) return;
1241 spixels = dpsoftrast.fb_colorpixels[0];
1242 swidth = dpsoftrast.fb_width;
1243 sheight = dpsoftrast.fb_height;
// texture->mipmap[mip][0] is used as a byte offset into texture->bytes, and
// entries [2] and [3] as the level's width and height
1244 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1245 twidth = texture->mipmap[mip][2];
1246 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the mip level and the source rectangle
// to the framebuffer
// NOTE(review): the two rectangles are clamped independently; it looks like
// clamping one origin does not shift the other - confirm this is intended
1247 if (tx1 < 0) tx1 = 0;
1248 if (ty1 < 0) ty1 = 0;
1249 if (tx2 > twidth) tx2 = twidth;
1250 if (ty2 > theight) ty2 = theight;
1251 if (sx1 < 0) sx1 = 0;
1252 if (sy1 < 0) sy1 = 0;
1253 if (sx2 > swidth) sx2 = swidth;
1254 if (sy2 > sheight) sy2 = sheight;
// copy size is limited to the smaller of the two clamped rectangles
1259 if (tw > sw) tw = sw;
1260 if (th > sh) th = sh;
1261 if (tw < 1 || th < 1)
// source rows are walked downwards from (sheight - 1 - sy1) while destination
// rows go upwards, so the image is flipped vertically during the copy
1263 sy1 = sheight - 1 - sy1;
1264 for (y = 0;y < th;y++)
1265 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// the remaining mip levels are recalculated after the copy
1266 if (texture->mipmaps > 1)
1267 DPSOFTRAST_Texture_CalculateMipmaps(index);
// DEFCOMMAND (defined elsewhere) is given the number 17, the name SetTexture
// and the payload fields unitnum and texture.
1270 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state: releases this thread's
// bind on the texture previously bound to the unit (if any), then records the
// new texture pointer, which may be NULL.
1271 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1273 if (thread->texbound[command->unitnum])
1274 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1275 thread->texbound[command->unitnum] = command->texture;
1277 void DPSOFTRAST_SetTexture(int unitnum, int index)
1279 DPSOFTRAST_Command_SetTexture *command;
1280 DPSOFTRAST_Texture *texture;
1281 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1283 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1286 texture = DPSOFTRAST_Texture_GetByIndex(index);
1287 if (index && !texture)
1289 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1293 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1294 command->unitnum = unitnum;
1295 command->texture = texture;
1297 dpsoftrast.texbound[unitnum] = texture;
1298 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array and its stride in the global state; the
// pointer is stored, not copied. The stride is used as a byte stride when the
// array is loaded.
1301 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1303 dpsoftrast.pointer_vertex3f = vertex3f;
1304 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA colour array and its stride. The unsigned-byte colour
// pointer is cleared so only one colour source is active.
1306 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1308 dpsoftrast.pointer_color4f = color4f;
1309 dpsoftrast.pointer_color4ub = NULL;
1310 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA colour array and its stride. The float colour
// pointer is cleared so only one colour source is active.
1312 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1314 dpsoftrast.pointer_color4f = NULL;
1315 dpsoftrast.pointer_color4ub = color4ub;
1316 dpsoftrast.stride_color = stride;
// Records the texcoord array for slot unitnum: pointer, component count and
// stride.
// NOTE(review): unitnum is used as an array index without a range check here -
// confirm callers guarantee it is valid.
1318 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1320 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1321 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1322 dpsoftrast.stride_texcoord[unitnum] = stride;
// DEFCOMMAND (defined elsewhere) is given the number 18, the name SetShader
// and the payload fields mode, permutation and exactspecularmath.
1325 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command to one thread's state by copying the three
// shader selection fields.
1326 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1328 thread->shader_mode = command->mode;
1329 thread->shader_permutation = command->permutation;
1330 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: allocates a SetShader command for the threads and also
// mirrors the selection into the global dpsoftrast state.
1332 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1334 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1335 command->mode = mode;
1336 command->permutation = permutation;
1337 command->exactspecularmath = exactspecularmath;
// keep the global copy in step with what was put in the command
1339 dpsoftrast.shader_mode = mode;
1340 dpsoftrast.shader_permutation = permutation;
1341 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// DEFCOMMAND (defined elsewhere) is given the number 19, the name Uniform4f
// and the payload fields index and val[4].
1344 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command to one thread's state: copies the four floats
// into thread->uniform4f at slot index (each slot is 4 floats wide).
1345 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1347 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a 4-component float uniform from four scalars: allocates a Uniform4f
// command for the threads and mirrors the value into dpsoftrast.uniform4f.
1349 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1351 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1352 command->index = index;
1353 command->val[0] = v0;
1354 command->val[1] = v1;
1355 command->val[2] = v2;
1356 command->val[3] = v3;
// global copy; slot index occupies floats [index*4, index*4+3]
1358 dpsoftrast.uniform4f[index*4+0] = v0;
1359 dpsoftrast.uniform4f[index*4+1] = v1;
1360 dpsoftrast.uniform4f[index*4+2] = v2;
1361 dpsoftrast.uniform4f[index*4+3] = v3;
// Array-argument variant of DPSOFTRAST_Uniform4f: v must point at 4 floats.
// Allocates a Uniform4f command and mirrors the value into
// dpsoftrast.uniform4f.
1363 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1365 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1366 command->index = index;
1367 memcpy(command->val, v, sizeof(command->val));
1369 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// DEFCOMMAND (defined elsewhere) is given the number 20, the name
// UniformMatrix4f and the payload fields index and val[16]. val is wrapped in
// ALIGN so aligned SSE stores can target it.
1372 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command to one thread's state: copies 16 floats
// into thread->uniform4f starting at slot index, so one matrix fills four
// consecutive 4-float slots.
1373 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1375 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads arraysize 4x4 float matrices starting at uniform slot `uniform`.
//   uniform   - first uniform slot; each matrix takes four consecutive slots
//   arraysize - number of matrices read from v
//   transpose - presumably selects the 4x4 transpose step below - confirm
//   v         - 16 floats per matrix, any alignment
// Each matrix gets its own UniformMatrix4f command and is also written into
// dpsoftrast.uniform4f.
1377 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// index advances by 4 slots and v by 16 floats per matrix
1381 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1383 __m128 m0, m1, m2, m3;
1384 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1385 command->index = (DPSOFTRAST_UNIFORM)index;
// use unaligned loads when v is not a multiple of ALIGN_SIZE
1386 if (((size_t)v)&(ALIGN_SIZE-1))
1388 m0 = _mm_loadu_ps(v);
1389 m1 = _mm_loadu_ps(v+4);
1390 m2 = _mm_loadu_ps(v+8);
1391 m3 = _mm_loadu_ps(v+12);
1395 m0 = _mm_load_ps(v);
1396 m1 = _mm_load_ps(v+4);
1397 m2 = _mm_load_ps(v+8);
1398 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine low and high halves
1402 __m128 t0, t1, t2, t3;
1403 t0 = _mm_unpacklo_ps(m0, m1);
1404 t1 = _mm_unpacklo_ps(m2, m3);
1405 t2 = _mm_unpackhi_ps(m0, m1);
1406 t3 = _mm_unpackhi_ps(m2, m3);
1407 m0 = _mm_movelh_ps(t0, t1);
1408 m1 = _mm_movehl_ps(t1, t0);
1409 m2 = _mm_movelh_ps(t2, t3);
1410 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both command->val and the addressed uniform4f entries must
// be 16-byte aligned
1412 _mm_store_ps(command->val, m0);
1413 _mm_store_ps(command->val+4, m1);
1414 _mm_store_ps(command->val+8, m2);
1415 _mm_store_ps(command->val+12, m3);
1416 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1417 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1418 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1419 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// DEFCOMMAND (defined elsewhere) is given the number 21, the name Uniform1i
// and the payload fields index and val.
1424 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command to one thread's state: stores the integer in
// thread->uniform1i[index].
1425 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1427 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: allocates a Uniform1i command for the threads and
// mirrors the value into dpsoftrast.uniform1i.
1429 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1431 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1432 command->index = index;
1435 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float elements from a strided source into a packed
// destination.
//   dst    - output, 4 floats per element, written with aligned stores
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
// Aligned loads are used only when both src and stride are multiples of
// ALIGN_SIZE; otherwise unaligned loads are used.
1439 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1441 float *end = dst + size*4;
1442 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1446 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1455 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float elements (x, y, z) from a strided source into packed
// 4-float elements (x, y, z, 1.0f).
//   dst    - output, 4 floats per element, written with aligned stores
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
1462 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1464 float *end = dst + size*4;
// tightly packed input: convert four elements (12 floats) per step
1465 if (stride == sizeof(float[3]))
// end4 marks the end of the part whose element count is a multiple of four
1467 float *end4 = dst + (size&~3)*4;
1468 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1..v3 hold x0 y0 z0 x1 | y1 z1 x2 y2 | z2 x3 y3 z3. Each output is built by
// rotating the wanted xyz into lanes 1..3, writing 1.0f into lane 0 with
// _mm_move_ss, then rotating back so the 1.0f lands in the w lane.
1472 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1473 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1474 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1477 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1480 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1484 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1486 src += 4*sizeof(float[3]);
// same conversion with aligned loads
1493 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1494 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1495 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1496 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1497 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1498 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1501 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1502 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1505 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1507 src += 4*sizeof(float[3]);
// one element per step; aligned loads need both src and stride to be
// multiples of ALIGN_SIZE
1511 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
// each load reads 4 floats, one more than the xyz triple; the extra lane is
// replaced by 1.0f before the store
1515 __m128 v = _mm_loadu_ps((const float *)src);
1516 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1517 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1518 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1519 _mm_store_ps(dst, v);
1528 __m128 v = _mm_load_ps((const float *)src);
1529 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1530 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1531 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1532 _mm_store_ps(dst, v);
// Expands `size` 2-float elements (a, b) from a strided source into packed
// 4-float elements (a, b, 0.0f, 1.0f).
//   dst    - output, 4 floats per element, written with aligned stores
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
1539 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1541 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output element
1542 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// tightly packed input: convert two elements (4 floats) per step
1543 if (stride == sizeof(float[2]))
1545 float *end2 = dst + (size&~1)*4;
1546 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// first output takes the low pair of v, second output the high pair
1550 __m128 v = _mm_loadu_ps((const float *)src);
1551 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1552 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1554 src += 2*sizeof(float[2]);
1561 __m128 v = _mm_load_ps((const float *)src);
1562 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1563 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1565 src += 2*sizeof(float[2]);
// single element: load 8 bytes into the low half and keep (0, 1) from v2
1571 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte elements from a strided source into packed 4-float
// elements, scaling each byte by 1/255 so 0..255 maps to 0.0..1.0.
//   dst    - output, 4 floats per element, written with aligned stores
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
1577 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1579 float *end = dst + size*4;
1580 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// tightly packed input: convert four elements (16 bytes) per step
1581 if (stride == sizeof(unsigned char[4]))
1583 float *end4 = dst + (size&~3)*4;
1584 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes to 16 bits, then to 32 bits, by interleaving with zero; convert
// to float and scale
1588 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1589 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1590 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1591 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1592 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1594 src += 4*sizeof(unsigned char[4]);
1601 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1602 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1603 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1604 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1605 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1607 src += 4*sizeof(unsigned char[4]);
// single element: the four bytes are read as one int
// NOTE(review): this dereferences src as int, so it assumes such a read is
// acceptable at this address - confirm
1613 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1614 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills `size` 4-float elements of dst with the single 4-float value at src.
// src is read once with an unaligned load; dst is written with aligned stores.
1620 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1622 float *end = dst + 4*size;
1623 __m128 v = _mm_loadu_ps(src);
1626 _mm_store_ps(dst, v);
// Transforms numitems 4-float vectors by a 4x4 matrix.
//   out4f       - output vectors
//   in4f        - input vectors (may be the same buffer as out4f)
//   numitems    - number of vectors
//   inmatrix16f - 16 floats, read as four consecutive 4-float groups m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
1632 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1635 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1636 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1638 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1640 // fast case for identity matrix
1641 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1644 end = out4f + numitems*4;
1645 m0 = _mm_loadu_ps(inmatrix16f);
1646 m1 = _mm_loadu_ps(inmatrix16f + 4);
1647 m2 = _mm_loadu_ps(inmatrix16f + 8);
1648 m3 = _mm_loadu_ps(inmatrix16f + 12);
// pick unaligned or aligned input loads once, outside the per-vertex loop
1649 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1653 __m128 v = _mm_loadu_ps(in4f);
1655 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1656 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1657 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1658 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1667 __m128 v = _mm_load_ps(in4f);
1669 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1670 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1671 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1672 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f with memcpy, so the two
// buffers must not overlap.
1680 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1682 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides a clip-space vertex (x, y, z, w) and applies the viewport
// mapping. The vertex is rotated to (1, x, y, z), scaled by viewportscale,
// divided by w, offset by viewportcenter and rotated back. Lane 0 of
// viewportcenter/viewportscale therefore applies to the 1/w term and lanes
// 1..3 to x, y, z. The result is
// (cx + sx*x/w, cy + sy*y/w, cz + sz*z/w, c0 + s0/w).
1686 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1688 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1689 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1690 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1691 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro with the same statements as DPSOFTRAST_PROJECTVERTEX:
// rotate the vertex to (1, x, y, z), scale by viewportscale, divide by w,
// offset by viewportcenter, rotate back.
1694 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1696 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1697 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1698 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1699 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a 4-float vertex by a matrix given as four __m128 values:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, each component of p being broadcast
// across all lanes before the multiply.
1702 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1705 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1707 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1708 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space Y range for an axis-aligned bounding box after
// transforming it by inmatrix16f.
//   starty, endy     - receive the integer row range
//   minposf, maxposf - box corners, 4 floats each, read with aligned loads
//   inmatrix16f      - 16-float matrix, any alignment
// The eight corners are transformed; corners with z + w >= 0 contribute their
// projected y directly, and the other corners contribute the points where
// their box edges cross the z + w = 0 plane.
1711 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// one bit per corner; a bit is cleared when that corner passes z + w >= 0
1713 int clipmask = 0xFF;
1714 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// minproj/maxproj start outside the [-2, 2] clamp range applied at the end
1715 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1716 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1717 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of each matrix vector so the transformed y lands in lane
// 0, where the scalar (_ss) min/max/div operations below act
1718 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1719 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1720 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1721 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and for an
// unclipped corner clear its mask bit and fold y/w into minproj/maxproj
1722 #define BBFRONT(k, pos) \
1724 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1725 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1726 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1729 clipmask &= ~(1<<k); \
1730 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1731 minproj = _mm_min_ss(minproj, proj); \
1732 maxproj = _mm_max_ss(maxproj, proj); \
1736 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1737 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1738 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1739 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1740 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1741 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1745 if (clipmask&(1<<k)) \
1747 if (!(clipmask&(1<<(k^1)))) \
1749 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1750 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1751 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1752 minproj = _mm_min_ss(minproj, proj); \
1753 maxproj = _mm_max_ss(maxproj, proj); \
1755 if (!(clipmask&(1<<(k^2)))) \
1757 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1758 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1759 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1760 minproj = _mm_min_ss(minproj, proj); \
1761 maxproj = _mm_max_ss(maxproj, proj); \
1763 if (!(clipmask&(1<<(k^4)))) \
1765 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1766 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1767 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
// for every clipped corner, visit its three edge neighbours (k^1, k^2, k^4)
1773 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move element 2 of the viewport vectors into lane 0 for the scalar mapping
// below
1774 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1775 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to screen rows
1776 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1777 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1778 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1779 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj; presumably the y viewport
// scale is negative so the larger projected y is the smaller row - confirm
1780 *starty = _mm_cvttss_si32(maxproj);
1781 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems clip-space vertices to screen space without a matrix
// transform.
//   out4f        - receives the unmodified input vertices (aligned stores)
//   screen4f     - receives the projected vertices (aligned stores)
//   starty, endy - receive the Y range computed by DPSOFTRAST_Vertex_BoundY
//   in4f         - input vertices, any alignment
// Returns the result of DPSOFTRAST_Vertex_BoundY for the input's bounding box.
1785 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1787 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1788 float *end = out4f + numitems*4;
1789 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1790 __m128 minpos, maxpos;
1791 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// the bounding box is seeded from the first vertex
1793 minpos = maxpos = _mm_loadu_ps(in4f);
1796 __m128 v = _mm_loadu_ps(in4f);
1797 minpos = _mm_min_ps(minpos, v);
1798 maxpos = _mm_max_ps(maxpos, v);
1799 _mm_store_ps(out4f, v);
1800 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801 _mm_store_ps(screen4f, v);
// same loop with aligned loads
1809 minpos = maxpos = _mm_load_ps(in4f);
1812 __m128 v = _mm_load_ps(in4f);
1813 minpos = _mm_min_ps(minpos, v);
1814 maxpos = _mm_max_ps(maxpos, v);
1815 _mm_store_ps(out4f, v);
1816 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1817 _mm_store_ps(screen4f, v);
// spill the bounds to aligned arrays; the vertices are already in clip space,
// so the identity matrix is passed to the bounding routine
1825 ALIGN(float minposf[4]);
1826 ALIGN(float maxposf[4]);
1827 _mm_store_ps(minposf, minpos);
1828 _mm_store_ps(maxposf, maxpos);
1829 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms numitems vertices by inmatrix16f and projects them to screen
// space.
//   out4f        - receives the transformed vertices (aligned stores)
//   screen4f     - receives the projected vertices (aligned stores)
//   starty, endy - receive the Y range computed by DPSOFTRAST_Vertex_BoundY
//   in4f         - input vertices, any alignment
//   inmatrix16f  - 16-float matrix, any alignment
// Returns the result of DPSOFTRAST_Vertex_BoundY, or of
// DPSOFTRAST_Vertex_Project when the matrix is the identity.
1834 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1836 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1837 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// an identity matrix (bytewise compare) is handed to the projection-only path
1839 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1840 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1841 end = out4f + numitems*4;
1842 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1843 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1844 m0 = _mm_loadu_ps(inmatrix16f);
1845 m1 = _mm_loadu_ps(inmatrix16f + 4);
1846 m2 = _mm_loadu_ps(inmatrix16f + 8);
1847 m3 = _mm_loadu_ps(inmatrix16f + 12);
1848 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// the bounding box is seeded from the first untransformed vertex
1850 minpos = maxpos = _mm_loadu_ps(in4f);
1853 __m128 v = _mm_loadu_ps(in4f);
// bounds are accumulated before the transform, in the input's own space
1854 minpos = _mm_min_ps(minpos, v);
1855 maxpos = _mm_max_ps(maxpos, v);
1856 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1857 _mm_store_ps(out4f, v);
1858 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1859 _mm_store_ps(screen4f, v);
// same loop with aligned loads
1867 minpos = maxpos = _mm_load_ps(in4f);
1870 __m128 v = _mm_load_ps(in4f);
1871 minpos = _mm_min_ps(minpos, v);
1872 maxpos = _mm_max_ps(maxpos, v);
1873 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1874 _mm_store_ps(out4f, v);
1875 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1876 _mm_store_ps(screen4f, v);
// the untransformed bounds go to the bounding routine together with the
// matrix, which transforms the box corners itself
1884 ALIGN(float minposf[4]);
1885 ALIGN(float maxposf[4]);
1886 _mm_store_ps(minposf, minpos);
1887 _mm_store_ps(maxposf, maxpos);
1888 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one vertex attribute array into dpsoftrast.post_array4f[outarray] as
// packed 4-float elements, for the current draw range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices).
//   outarray - index of the destination post-transform array
//   inarray  - which client array to read (position, colour or a texcoord set)
1894 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1897 float *outf = dpsoftrast.post_array4f[outarray];
1898 const unsigned char *inb;
1899 int firstvertex = dpsoftrast.firstvertex;
1900 int numvertices = dpsoftrast.numvertices;
// positions: xyz expanded to xyzw by the 3f loader
1904 case DPSOFTRAST_ARRAY_POSITION:
1905 stride = dpsoftrast.stride_vertex;
1906 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1907 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colours: the float array is checked first, then the unsigned-byte array
1909 case DPSOFTRAST_ARRAY_COLOR:
1910 stride = dpsoftrast.stride_color;
1911 if (dpsoftrast.pointer_color4f)
1913 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1914 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1916 else if (dpsoftrast.pointer_color4ub)
1918 stride = dpsoftrast.stride_color;
1919 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1920 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// presumably the fallback when neither colour array is set: every vertex gets
// the constant dpsoftrast.color
1924 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoord sets: the slot is inarray - DPSOFTRAST_ARRAY_TEXCOORD0
1928 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1929 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1931 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
// choose the loader that matches the stored component count
1932 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1935 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1938 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1941 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a post array in place by inmatrix16f. When inarray >= 0 the array
// is first loaded from client array inarray; otherwise the existing contents
// of dpsoftrast.post_array4f[outarray] are used.
1953 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1955 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1956 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a post array to screen space. When inarray >= 0 the array is first
// loaded from client array inarray. Screen coordinates go to
// dpsoftrast.screencoord4f, the Y range to dpsoftrast.drawstarty/drawendy and
// the projection routine's return value to dpsoftrast.drawclipped.
1961 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1964 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1965 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a post array in place by inmatrix16f and projects it to screen
// space. When inarray >= 0 the array is first loaded from client array
// inarray. Screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty/drawendy and the routine's return value to
// dpsoftrast.drawclipped.
1973 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1976 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1977 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes the per-pixel reciprocal of the interpolated w term across a span,
// from startx up to (not including) endx.
//   triangle - w[0] is the slope per pixel in x, w[1] the slope per row and
//              w[2] the base value
//   span     - supplies the span origin (x, y) and its startx/endx range
//   zf       - presumably the per-pixel output buffer - confirm
1984 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1987 int startx = span->startx;
1988 int endx = span->endx;
1989 float wslope = triangle->w[0];
// w evaluated at the span origin
1990 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1991 float endz = 1.0f / (w + wslope * startx);
// with no x slope the value is the same for every pixel in the span
1992 if (triangle->w[0] == 0)
1994 // LordHavoc: fast flat polygons (HUD/menu)
1995 for (x = startx;x < endx;x++)
// otherwise the exact reciprocal is taken only at subspan ends
// (DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and interpolated linearly between
1999 for (x = startx;x < endx;)
2001 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last subspan is cut short so it ends on the final pixel
2003 if (nextsub >= endx) nextsub = endsub = endx-1;
2004 endz = 1.0f / (w + wslope * nextsub);
// the x < nextsub test avoids dividing by zero on a one-pixel subspan
2005 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2006 for (; x <= endsub; x++, z += dz)
2011 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2014 int startx = span->startx;
2015 int endx = span->endx;
2018 unsigned char * RESTRICT pixelmask = span->pixelmask;
2019 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2022 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2023 // handle alphatest now (this affects depth writes too)
2024 if (thread->alphatest)
2025 for (x = startx;x < endx;x++)
2026 if (in4f[x*4+3] < 0.5f)
2027 pixelmask[x] = false;
2028 // FIXME: this does not handle bigendian
2029 switch(thread->fb_blendmode)
2031 case DPSOFTRAST_BLENDMODE_OPAQUE:
2032 for (x = startx;x < endx;x++)
2036 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2037 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2038 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2039 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2040 pixel[x*4+0] = d[0];
2041 pixel[x*4+1] = d[1];
2042 pixel[x*4+2] = d[2];
2043 pixel[x*4+3] = d[3];
2046 case DPSOFTRAST_BLENDMODE_ALPHA:
2047 for (x = startx;x < endx;x++)
2051 a = in4f[x*4+3] * 255.0f;
2052 b = 1.0f - in4f[x*4+3];
2053 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2054 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2055 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2056 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2057 pixel[x*4+0] = d[0];
2058 pixel[x*4+1] = d[1];
2059 pixel[x*4+2] = d[2];
2060 pixel[x*4+3] = d[3];
2063 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2064 for (x = startx;x < endx;x++)
2068 a = in4f[x*4+3] * 255.0f;
2069 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2070 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2071 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2072 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2073 pixel[x*4+0] = d[0];
2074 pixel[x*4+1] = d[1];
2075 pixel[x*4+2] = d[2];
2076 pixel[x*4+3] = d[3];
2079 case DPSOFTRAST_BLENDMODE_ADD:
2080 for (x = startx;x < endx;x++)
2084 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2085 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2086 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2087 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2088 pixel[x*4+0] = d[0];
2089 pixel[x*4+1] = d[1];
2090 pixel[x*4+2] = d[2];
2091 pixel[x*4+3] = d[3];
2094 case DPSOFTRAST_BLENDMODE_INVMOD:
2095 for (x = startx;x < endx;x++)
2099 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2100 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2101 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2102 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2103 pixel[x*4+0] = d[0];
2104 pixel[x*4+1] = d[1];
2105 pixel[x*4+2] = d[2];
2106 pixel[x*4+3] = d[3];
2109 case DPSOFTRAST_BLENDMODE_MUL:
2110 for (x = startx;x < endx;x++)
2114 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2115 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2116 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2117 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2118 pixel[x*4+0] = d[0];
2119 pixel[x*4+1] = d[1];
2120 pixel[x*4+2] = d[2];
2121 pixel[x*4+3] = d[3];
2124 case DPSOFTRAST_BLENDMODE_MUL2:
2125 for (x = startx;x < endx;x++)
2129 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2130 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2131 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2132 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2133 pixel[x*4+0] = d[0];
2134 pixel[x*4+1] = d[1];
2135 pixel[x*4+2] = d[2];
2136 pixel[x*4+3] = d[3];
2139 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2140 for (x = startx;x < endx;x++)
2144 a = in4f[x*4+3] * -255.0f;
2145 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2146 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2147 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2148 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2149 pixel[x*4+0] = d[0];
2150 pixel[x*4+1] = d[1];
2151 pixel[x*4+2] = d[2];
2152 pixel[x*4+3] = d[3];
2155 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2156 for (x = startx;x < endx;x++)
2161 b = 1.0f - in4f[x*4+3];
2162 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2163 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2164 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2165 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2166 pixel[x*4+0] = d[0];
2167 pixel[x*4+1] = d[1];
2168 pixel[x*4+2] = d[2];
2169 pixel[x*4+3] = d[3];
2172 case DPSOFTRAST_BLENDMODE_INVADD:
2173 for (x = startx;x < endx;x++)
2177 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2178 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2179 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2180 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2181 pixel[x*4+0] = d[0];
2182 pixel[x*4+1] = d[1];
2183 pixel[x*4+2] = d[2];
2184 pixel[x*4+3] = d[3];
// Blend one span of packed 8-bit source colors (in4ub, 4 bytes per pixel) into color buffer 0
// of the framebuffer (dpsoftrast.fb_colorpixels[0]).
//   thread   - supplies the alphatest flag and the active fb_blendmode
//   triangle - not referenced on the lines of this function shown here
//   span     - supplies y, x, startx, endx and the per-pixel pixelmask
//   in4ub    - source colors, indexed by the same x as the span
// Pixels whose pixelmask byte is zero are left untouched; the alpha test below may clear
// additional mask bytes, so it modifies span->pixelmask in place.
2190 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2194 int startx = span->startx;
2195 int endx = span->endx;
2196 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2197 unsigned char * RESTRICT pixelmask = span->pixelmask;
2198 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2199 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both views of the framebuffer to the first pixel of this span's row (byte view and 32-bit view)
2202 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2203 pixeli += span->y * dpsoftrast.fb_width + span->x;
2204 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub holds unsigned chars (0..255), so "< 0.5f" is only true for alpha == 0.
// A half-coverage threshold on bytes would be 128 - confirm which behavior is intended.
2205 if (thread->alphatest)
2206 for (x = startx;x < endx;x++)
2207 if (in4ub[x*4+3] < 0.5f)
2208 pixelmask[x] = false;
2209 // FIXME: this does not handle bigendian
2210 switch(thread->fb_blendmode)
// opaque: when four consecutive mask bytes are all 1, copy four pixels (16 bytes) with one unaligned load/store
2212 case DPSOFTRAST_BLENDMODE_OPAQUE:
2213 for (x = startx;x + 4 <= endx;)
2215 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2217 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND(blend2, blend1) is the shared loop for all non-opaque modes.  It walks the span two
// pixels at a time, switching on the two mask bytes read as one 16-bit value: one body loads and
// stores both pixels (64-bit), the other two handle only pixel x+1 or only pixel x.  A trailing loop
// handles a remaining single pixel.  src and dst are widened from 8 to 16 bits per channel before the
// blend statement runs, and _mm_packus_epi16 saturates the result back to 0..255 when storing.
// blend2 is the statement used for two pixels, blend1 the one used for a single pixel.
//
// alpha blend: dst += (src - dst) * alpha.  Both factors are shifted left by 4 so that the signed
// high-word multiply yields ((src - dst) * alpha) >> 8.  The shuffles with _MM_SHUFFLE(3, 3, 3, 3)
// broadcast 16-bit lane 3 of each pixel (the alpha channel) to all four lanes of that pixel.
2231 case DPSOFTRAST_BLENDMODE_ALPHA:
2232 #define FINISHBLEND(blend2, blend1) \
2233 for (x = startx;x + 1 < endx;x += 2) \
2236 switch (*(const unsigned short*)&pixelmask[x]) \
2239 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2240 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2242 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2245 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2246 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2248 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2251 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2252 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2254 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2259 for(;x < endx; x++) \
2262 if (!pixelmask[x]) \
2264 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2265 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2267 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2271 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2274 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2275 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// additive with alpha: dst += (src * alpha) >> 8
2278 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2280 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2281 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2283 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2284 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// additive: dst += src (the saturating pack in the macro clamps the sum to 255)
2287 case DPSOFTRAST_BLENDMODE_ADD:
2288 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// inverse modulate: dst -= (dst * src) >> 8
2290 case DPSOFTRAST_BLENDMODE_INVMOD:
2292 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2294 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// modulate: dst = (src * dst) >> 8
2297 case DPSOFTRAST_BLENDMODE_MUL:
2298 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// modulate x2: dst = (src * dst) >> 7
2300 case DPSOFTRAST_BLENDMODE_MUL2:
2301 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subtractive with alpha: dst -= (src * alpha) >> 8 (negative results pack to 0)
2303 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2305 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2306 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2308 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2309 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// pseudo alpha: dst = src + dst - ((dst * alpha) >> 8), i.e. the source is added unscaled
2312 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2314 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2315 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2317 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2318 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// inverse add: dst += ((255 - dst) * src) >> 8, using the same shift-by-4 / high-multiply form as alpha blend
2321 case DPSOFTRAST_BLENDMODE_INVADD:
2323 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2325 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Sample the 2D texture bound to texunitindex along one span and write float colors to out4f
// (4 floats per pixel, indexed by x in [span->startx, span->endx)).
//   thread       - supplies the bound texture (thread->texbound[texunitindex])
//   triangle     - supplies the mip level for this texture unit and the attribute gradients
//   arrayindex   - which interpolated attribute array holds the texture coordinates
//   zf           - per-pixel factor multiplied into the interpolated coordinates
//                  (presumably the perspective-correction term - confirm against the caller)
// Texel bytes are read swizzled: out[0] comes from byte 2, out[1] from byte 1, out[2] from byte 0,
// out[3] from byte 3, each scaled to 0..1.
2332 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2335 int startx = span->startx;
2336 int endx = span->endx;
2341 float tc[2], endtc[2];
2343 unsigned int tci[2];
2344 unsigned int tci1[2];
2345 unsigned int tcimin[2];
2346 unsigned int tcimax[2];
2351 const unsigned char * RESTRICT pixelbase;
2352 const unsigned char * RESTRICT pixel[4];
2353 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2354 // if no texture is bound, just fill it with white
2357 for (x = startx;x < endx;x++)
2359 out4f[x*4+0] = 1.0f;
2360 out4f[x*4+1] = 1.0f;
2361 out4f[x*4+2] = 1.0f;
2362 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is used as the byte offset of the selected mip level inside texture->bytes,
// mipmap[mip][1] is compared against 4 (one 4-byte texel), and [2]/[3] are used as width/height below
2366 mip = triangle->mip[texunitindex];
2367 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2368 // if this mipmap of the texture is 1 pixel, just fill it with that color
// NOTE(review): the color is read from texture->bytes (start of the texture data) rather than from
// pixelbase (start of the selected mip level); the BGRA8 variant of this function reads pixelbase.
// These only agree when mipmap[mip][0] is 0 - confirm whether pixelbase was intended.
2369 if (texture->mipmap[mip][1] == 4)
2371 c[0] = texture->bytes[2] * (1.0f/255.0f);
2372 c[1] = texture->bytes[1] * (1.0f/255.0f);
2373 c[2] = texture->bytes[0] * (1.0f/255.0f);
2374 c[3] = texture->bytes[3] * (1.0f/255.0f);
2375 for (x = startx;x < endx;x++)
2377 out4f[x*4+0] = c[0];
2378 out4f[x*4+1] = c[1];
2379 out4f[x*4+2] = c[2];
2380 out4f[x*4+3] = c[3];
2384 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2385 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2386 flags = texture->flags;
// scale factors from normalized coordinates to texels, row width in texels, and the clamp / wrap limits
// (the wrap mask of size-1 is only a valid modulo when the mip dimensions are powers of two)
2387 tcscale[0] = texture->mipmap[mip][2];
2388 tcscale[1] = texture->mipmap[mip][3];
2389 tciwidth = texture->mipmap[mip][2];
2392 tcimax[0] = texture->mipmap[mip][2]-1;
2393 tcimax[1] = texture->mipmap[mip][3]-1;
2394 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2395 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel of the span
2396 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2397 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// The span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels.  The exact
// coordinate is evaluated only at the sub-span end (nextsub); pixels in between step linearly
// in 20.12 fixed point (subtc, substep).
2403 for (x = startx;x < endx;)
2405 unsigned int subtc[2];
2406 unsigned int substep[2];
2407 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2408 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: end exactly on the final pixel and rescale the step to the shorter length
2409 if (nextsub >= endx)
2411 nextsub = endsub = endx-1;
2412 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2416 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2417 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// NOTE(review): tc is presumably the previous endtc (start of this sub-span); its assignment is not on the lines shown here
2423 substep[0] = (endtc[0] - tc[0]) * subscale;
2424 substep[1] = (endtc[1] - tc[1]) * subscale;
2425 subtc[0] = tc[0] * (1<<12);
2426 subtc[1] = tc[1] * (1<<12);
// bilinear, clamp to edge: the low 12 bits of each coordinate are the blend fraction; the four
// weights are products of two 12-bit factors, so a byte times the weights sums to at most
// 255 * 2^24 = 0xFF000000, which is the normalization constant used below
2429 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2431 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2433 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2434 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2435 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2436 tci[0] = subtc[0]>>12;
2437 tci[1] = subtc[1]>>12;
2438 tci1[0] = tci[0] + 1;
2439 tci1[1] = tci[1] + 1;
// clamp both the texel and its +1 neighbour into [tcimin, tcimax]
2440 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2441 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2442 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2443 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2444 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2445 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2446 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2447 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2448 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2449 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2450 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2451 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2452 out4f[x*4+0] = c[0];
2453 out4f[x*4+1] = c[1];
2454 out4f[x*4+2] = c[2];
2455 out4f[x*4+3] = c[3];
// bilinear, repeat: same weights, but texel indices are wrapped with the size-1 mask
2460 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2462 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2463 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2464 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2465 tci[0] = subtc[0]>>12;
2466 tci[1] = subtc[1]>>12;
2467 tci1[0] = tci[0] + 1;
2468 tci1[1] = tci[1] + 1;
2469 tci[0] &= tciwrapmask[0];
2470 tci[1] &= tciwrapmask[1];
2471 tci1[0] &= tciwrapmask[0];
2472 tci1[1] &= tciwrapmask[1];
2473 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2474 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2475 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2476 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2477 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2478 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2479 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2480 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2481 out4f[x*4+0] = c[0];
2482 out4f[x*4+1] = c[1];
2483 out4f[x*4+2] = c[2];
2484 out4f[x*4+3] = c[3];
// single-texel fetch, clamp to edge
2488 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2490 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2492 tci[0] = subtc[0]>>12;
2493 tci[1] = subtc[1]>>12;
2494 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2495 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2496 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2497 c[0] = pixel[0][2] * (1.0f / 255.0f);
2498 c[1] = pixel[0][1] * (1.0f / 255.0f);
2499 c[2] = pixel[0][0] * (1.0f / 255.0f);
2500 c[3] = pixel[0][3] * (1.0f / 255.0f);
2501 out4f[x*4+0] = c[0];
2502 out4f[x*4+1] = c[1];
2503 out4f[x*4+2] = c[2];
2504 out4f[x*4+3] = c[3];
// single-texel fetch, repeat
2509 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2511 tci[0] = subtc[0]>>12;
2512 tci[1] = subtc[1]>>12;
2513 tci[0] &= tciwrapmask[0];
2514 tci[1] &= tciwrapmask[1];
2515 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2516 c[0] = pixel[0][2] * (1.0f / 255.0f);
2517 c[1] = pixel[0][1] * (1.0f / 255.0f);
2518 c[2] = pixel[0][0] * (1.0f / 255.0f);
2519 c[3] = pixel[0][3] * (1.0f / 255.0f);
2520 out4f[x*4+0] = c[0];
2521 out4f[x*4+1] = c[1];
2522 out4f[x*4+2] = c[2];
2523 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture span fetch: samples the texture bound to texunitindex and writes
// packed 4-byte texels to out4ub (indexed by x in [span->startx, span->endx)), without swizzling.
//   thread       - supplies the bound texture (thread->texbound[texunitindex])
//   triangle     - supplies the mip level for this texture unit and the attribute gradients
//   arrayindex   - which interpolated attribute array holds the texture coordinates
//   zf           - per-pixel factor multiplied into the interpolated coordinates
//                  (presumably the perspective-correction term - confirm against the caller)
// Coordinates are stepped in 16.16 fixed point, two pixels per iteration, with a single-pixel tail.
2529 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2533 int startx = span->startx;
2534 int endx = span->endx;
2536 __m128 data, slope, tcscale;
2537 __m128i tcsize, tcmask, tcoffset, tcmax;
2539 __m128i subtc, substep, endsubtc;
2542 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2543 const unsigned char * RESTRICT pixelbase;
2544 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2545 // if no texture is bound, just fill it with white
2548 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2551 mip = triangle->mip[texunitindex];
2552 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2553 // if this mipmap of the texture is 1 pixel, just fill it with that color
2554 if (texture->mipmap[mip][1] == 4)
2556 unsigned int k = *((const unsigned int *)pixelbase);
2557 for (x = startx;x < endx;x++)
2561 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2562 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2563 flags = texture->flags;
// tcsize = (mipmap[mip][2], mipmap[mip][3]) repeated twice, i.e. (width, height, width, height);
// tcmask = size - 1 per lane; tcscale is the same size as floats
2564 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2565 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2566 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the (s, t) pair into both halves and pre-scale to texels
2567 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2568 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2569 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2571 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2572 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// tcoffset: every 32-bit lane holds (width << 18) + 4, i.e. the 16-bit pair (4, width*4).
// _mm_madd_epi16 of an (x, y) texel index pair with it yields x*4 + y*width*4, the byte offset of the texel.
2573 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// tcmax: size - 1 packed to 16-bit lanes, used both as the clamp limit and as the wrap mask
2574 tcmax = _mm_packs_epi32(tcmask, tcmask);
// process the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the exact coordinate is
// evaluated only at each sub-span end and stepped linearly in between
2575 for (x = startx;x < endx;)
2577 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2578 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2579 if (nextsub >= endx)
2581 nextsub = endsub = endx-1;
2582 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2586 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2588 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2589 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2590 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half of subtc = coordinates of pixel x, high half = coordinates of pixel x+1; the step is
// doubled because each iteration advances two pixels
2591 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2592 substep = _mm_slli_epi32(substep, 1);
// bilinear fast path: taken when the integer texel coordinates at the start of the sub-span and just
// past its end are all >= 0 and < size-1, so neither texel of any 2x2 footprint can leave the texture
// and no clamping or wrapping is needed
2595 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2596 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// stride = width*4 bytes (the high 16 bits of tcoffset)
2598 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2599 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2601 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2602 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2603 tci = _mm_madd_epi16(tci, tcoffset);
2604 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2605 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; pix1/pix2 are the upper/lower rows for
// pixel x, pix3/pix4 for pixel x+1
2606 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2607 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2608 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2609 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fracm: fractions halved to 15 bits so they are non-negative for the signed high multiply; the
// difference is doubled to compensate, giving (diff * frac) >> 16
2610 fracm = _mm_srli_epi16(subtc, 1);
2611 pix1 = _mm_add_epi16(pix1,
2612 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2613 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2614 pix3 = _mm_add_epi16(pix3,
2615 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2616 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2617 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2618 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2619 pix2 = _mm_add_epi16(pix2,
2620 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2621 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2622 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail of the fast path
2626 const unsigned char * RESTRICT ptr1;
2627 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2628 tci = _mm_madd_epi16(tci, tcoffset);
2629 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2630 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2631 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2632 fracm = _mm_srli_epi16(subtc, 1);
2633 pix1 = _mm_add_epi16(pix1,
2634 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2635 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2636 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2637 pix1 = _mm_add_epi16(pix1,
2638 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2639 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2640 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, clamp to edge: the four texel indices (x,y), (x+1,y), (x,y+1), (x+1,y+1) are built by
// adding the constant (0, 1, 0x10000, 0x10001) and are clamped individually to [0, size-1]
2644 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2646 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2648 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2649 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2650 tci = _mm_madd_epi16(tci, tcoffset);
2651 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2652 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2653 _mm_setzero_si128());
2654 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2655 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2656 _mm_setzero_si128());
2657 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): this is the second pixel of the pair inside the CLAMPTOEDGE branch, yet its indices
// are wrapped with _mm_and_si128 instead of clamped with min/max like the first pixel above and the
// single-pixel tail below.  Looks like a copy from the repeat branch - confirm and clamp here as well.
2658 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659 tci = _mm_madd_epi16(tci, tcoffset);
2660 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662 _mm_setzero_si128());
2663 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665 _mm_setzero_si128());
2666 fracm = _mm_srli_epi16(subtc, 1);
2667 pix1 = _mm_add_epi16(pix1,
2668 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2670 pix3 = _mm_add_epi16(pix3,
2671 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2672 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2673 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2674 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2675 pix2 = _mm_add_epi16(pix2,
2676 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2677 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2678 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail, clamped
2682 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2683 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2684 tci = _mm_madd_epi16(tci, tcoffset);
2685 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2686 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2687 _mm_setzero_si128());
2688 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2689 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2690 _mm_setzero_si128());
2691 fracm = _mm_srli_epi16(subtc, 1);
2692 pix1 = _mm_add_epi16(pix1,
2693 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2694 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2695 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2696 pix1 = _mm_add_epi16(pix1,
2697 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2698 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2699 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, repeat: identical to the clamp branch except that indices are wrapped with the size-1 mask
2705 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2707 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2708 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2709 tci = _mm_madd_epi16(tci, tcoffset);
2710 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2711 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2712 _mm_setzero_si128());
2713 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2714 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2715 _mm_setzero_si128());
2716 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2717 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2718 tci = _mm_madd_epi16(tci, tcoffset);
2719 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2720 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2721 _mm_setzero_si128());
2722 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2723 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2724 _mm_setzero_si128());
2725 fracm = _mm_srli_epi16(subtc, 1);
2726 pix1 = _mm_add_epi16(pix1,
2727 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2729 pix3 = _mm_add_epi16(pix3,
2730 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2731 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2732 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2733 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2734 pix2 = _mm_add_epi16(pix2,
2735 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2736 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2737 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail, wrapped
2741 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2742 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2743 tci = _mm_madd_epi16(tci, tcoffset);
2744 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2745 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2746 _mm_setzero_si128());
2747 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2748 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2749 _mm_setzero_si128());
2750 fracm = _mm_srli_epi16(subtc, 1);
2751 pix1 = _mm_add_epi16(pix1,
2752 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2753 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2754 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2755 pix1 = _mm_add_epi16(pix1,
2756 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2757 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2758 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel fetch, clamp to edge: the integer parts of the coordinates are clamped to [0, size-1]
// and each texel is copied as one 32-bit word
2765 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2767 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2769 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2770 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2771 tci = _mm_madd_epi16(tci, tcoffset);
2772 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2773 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2777 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2778 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2779 tci = _mm_madd_epi16(tci, tcoffset);
2780 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel fetch, repeat: indices are wrapped with the size-1 mask
2786 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2788 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2789 tci = _mm_and_si128(tci, tcmax);
2790 tci = _mm_madd_epi16(tci, tcoffset);
2791 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2792 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2796 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2797 tci = _mm_and_si128(tci, tcmax);
2798 tci = _mm_madd_epi16(tci, tcoffset);
2799 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2808 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2811 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadow map lookup helper: takes a read-only float vector and returns a single float.
// NOTE(review): the layout of `vector` and the meaning/range of the returned value are not
// established by this declaration line - confirm against the function body and its callers.
2814 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiply the span colors in in4f by an interpolated 4-component vertex attribute and store the
// products in out4f (4 floats per pixel, x in [span->startx, span->endx)).
//   arrayindex - selects which attribute array DPSOFTRAST_CALCATTRIB4F loads into data/slope
//   zf         - per-pixel factor array
// The attribute at pixel x is (data[i] + slope[i]*x) * z for each component i.
// NOTE(review): z is presumably zf[x]; its assignment is not among the lines shown here - confirm.
2820 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2823 int startx = span->startx;
2824 int endx = span->endx;
2829 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2830 for (x = startx;x < endx;x++)
// evaluate the linear attribute function at x, then scale by z
2833 c[0] = (data[0] + slope[0]*x) * z;
2834 c[1] = (data[1] + slope[1]*x) * z;
2835 c[2] = (data[2] + slope[2]*x) * z;
2836 c[3] = (data[3] + slope[3]*x) * z;
// component-wise modulate of the incoming color
2837 out4f[x*4+0] = in4f[x*4+0] * c[0];
2838 out4f[x*4+1] = in4f[x*4+1] * c[1];
2839 out4f[x*4+2] = in4f[x*4+2] * c[2];
2840 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Write an interpolated 4-component vertex attribute to out4f for every pixel of the span
// (4 floats per pixel, x in [span->startx, span->endx)).
//   arrayindex - selects which attribute array DPSOFTRAST_CALCATTRIB4F loads into data/slope
//   zf         - per-pixel factor array
// The value at pixel x is (data[i] + slope[i]*x) * z for each component i.
// NOTE(review): z is presumably zf[x]; its assignment is not among the lines shown here - confirm.
2844 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2847 int startx = span->startx;
2848 int endx = span->endx;
2853 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2854 for (x = startx;x < endx;x++)
// evaluate the linear attribute function at x, then scale by z
2857 c[0] = (data[0] + slope[0]*x) * z;
2858 c[1] = (data[1] + slope[1]*x) * z;
2859 c[2] = (data[2] + slope[2]*x) * z;
2860 c[3] = (data[3] + slope[3]*x) * z;
2861 out4f[x*4+0] = c[0];
2862 out4f[x*4+1] = c[1];
2863 out4f[x*4+2] = c[2];
2864 out4f[x*4+3] = c[3];
2868 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2870 int x, startx = span->startx, endx = span->endx;
2871 float c[4], localcolor[4];
2872 localcolor[0] = subcolor[0];
2873 localcolor[1] = subcolor[1];
2874 localcolor[2] = subcolor[2];
2875 localcolor[3] = subcolor[3];
2876 for (x = startx;x < endx;x++)
2878 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2879 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2880 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2881 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2882 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2883 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2884 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2885 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2889 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2891 int x, startx = span->startx, endx = span->endx;
2892 for (x = startx;x < endx;x++)
2894 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2895 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2896 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2897 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2901 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2903 int x, startx = span->startx, endx = span->endx;
2904 for (x = startx;x < endx;x++)
2906 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2907 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2908 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2909 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float RGBA spans per pixel: out4f = ina4f * a + inb4f * b for every
// pixel in [span->startx, span->endx), with a = 1 - (fourth component of inb4f).
// NOTE(review): the declaration of a/b and the assignment of b are not visible in
// this excerpt; b is presumably inb4f[x*4+3] (a plain alpha blend) - confirm.
2913 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2915 int x, startx = span->startx, endx = span->endx;
2917 for (x = startx;x < endx;x++)
// weight of the first buffer: one minus the second buffer's fourth component
2919 a = 1.0f - inb4f[x*4+3];
2921 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2922 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2923 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2924 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2928 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2930 int x, startx = span->startx, endx = span->endx;
2931 float localcolor[4], ilerp, lerp;
2932 localcolor[0] = color[0];
2933 localcolor[1] = color[1];
2934 localcolor[2] = color[2];
2935 localcolor[3] = color[3];
2936 ilerp = 1.0f - localcolor[3];
2937 lerp = localcolor[3];
2938 for (x = startx;x < endx;x++)
2940 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2941 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2942 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2943 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Modulates an 8-bit span by an interpolated attribute using SSE2 16-bit fixed point:
// out4ub = in4ub * attribute, with the attribute scaled by 256.
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at sub-span
// boundaries; in between it is stepped linearly in fixed point.
// NOTE(review): several short lines are not visible in this excerpt (declarations of
// x/data/slope/mod/endmod, braces, the assignments of mod/submod at the top of each
// sub-span, the condition guarding the single-pixel tail) - confirm against the full file.
2949 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2953 int startx = span->startx;
2954 int endx = span->endx;
2957 __m128i submod, substep, endsubmod;
2958 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the attribute and its slope (the output is BGRA per the function name)
2959 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2960 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as integer scaled by 256
2961 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2962 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2963 for (x = startx; x < endx;)
// a sub-span normally covers DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the last one is clamped to end at endx-1
2965 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2966 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2967 if (nextsub >= endx)
2969 nextsub = endsub = endx-1;
2970 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the end of the sub-span and the per-pixel fixed-point step towards it
2974 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2975 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2976 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// low four 16-bit lanes: modulator for pixel x, high four lanes: modulator for pixel x+1
2977 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2978 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 while submod advances by a single per-pixel substep;
// this looks like it should be 2*substep - confirm.
2979 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// input bytes are placed in the high byte of each 16-bit lane, so mulhi yields (byte * submod) >> 8;
// packus then saturates the result back to bytes
2981 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2982 pix = _mm_mulhi_epu16(pix, submod);
2983 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the loop body above (32-bit load/store)
2987 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2988 pix = _mm_mulhi_epu16(pix, submod);
2989 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated attribute into an 8-bit span using SSE2 fixed point:
// the attribute is scaled by 4095, stepped in 16-bit lanes and shifted right by 4
// before being packed (with saturation) to bytes.
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at sub-span
// boundaries; in between it is stepped linearly.
// NOTE(review): several short lines are not visible in this excerpt (declarations of
// x/data/slope/mod/endmod, braces, the assignments of mod/submod at the top of each
// sub-span, the condition guarding the single-pixel tail) - confirm against the full file.
2996 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3000 int startx = span->startx;
3001 int endx = span->endx;
3004 __m128i submod, substep, endsubmod;
3005 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the attribute and its slope (the output is BGRA per the function name)
3006 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3007 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as integer scaled by 4095
3008 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3009 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3010 for (x = startx; x < endx;)
// a sub-span normally covers DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the last one is clamped to end at endx-1
3012 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3013 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3014 if (nextsub >= endx)
3016 nextsub = endsub = endx-1;
3017 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the end of the sub-span and the per-pixel fixed-point step towards it
3021 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3022 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3023 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// low four 16-bit lanes: value for pixel x, high four lanes: value for pixel x+1
3024 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3025 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 while submod advances by a single per-pixel substep;
// this looks like it should be 2*substep - confirm.
3026 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then saturate to bytes
3028 __m128i pix = _mm_srai_epi16(submod, 4);
3029 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the loop body above (32-bit store)
3033 __m128i pix = _mm_srai_epi16(submod, 4);
3034 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit SSE2 bloom add: out4ub = ina4ub + saturating(inb4ub - subcolor*255) per byte,
// for pixels in [span->startx, span->endx), two pixels at a time plus a single-pixel tail.
// NOTE(review): braces, the condition guarding the tail and any preprocessor guard are
// not visible in this excerpt.
3041 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3044 int x, startx = span->startx, endx = span->endx;
// subtract color scaled to 0..255, components 0 and 2 swapped, then replicated into both halves as 16-bit lanes
3045 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3046 localcolor = _mm_packs_epi32(localcolor, localcolor);
3047 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16 bits per component
3049 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3050 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// the unsigned saturating subtract clamps the difference at zero; packus clamps the sum at 255
3051 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3052 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above
3056 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3057 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3058 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3059 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 component-wise product of two spans: out4ub = (ina4ub * inb4ub) >> 8,
// for pixels in [span->startx, span->endx), two pixels at a time plus a single-pixel tail.
// NOTE(review): braces, the condition guarding the tail and any preprocessor guard are
// not visible in this excerpt.
3064 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3067 int x, startx = span->startx, endx = span->endx;
3068 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds the bytes of ina as 16-bit values, pix2 holds the bytes of inb in the high byte
// of each lane (value << 8), so the high half of the 16x16 product is (a * b) >> 8
3070 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3071 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3072 pix1 = _mm_mulhi_epu16(pix1, pix2);
3073 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above
3077 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3078 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3079 pix1 = _mm_mulhi_epu16(pix1, pix2);
3080 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 saturating sum of two spans: out4ub = min(ina4ub + inb4ub, 255) per byte,
// for pixels in [span->startx, span->endx), two pixels at a time plus a single-pixel tail.
// NOTE(review): braces, the condition guarding the tail and any preprocessor guard are
// not visible in this excerpt.
3085 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3088 int x, startx = span->startx, endx = span->endx;
3089 for (x = startx;x+2 <= endx;x+=2)
// widen to 16 bits, add, and let packus clamp the sums to 0..255
3091 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3092 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3093 pix1 = _mm_add_epi16(pix1, pix2);
3094 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above
3098 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3099 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3100 pix1 = _mm_add_epi16(pix1, pix2);
3101 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 tinted add: out4ub = ina4ub + ((inb4ub * tint) >> 8) with saturation, where
// tint is inbtintbgra scaled by 256. Processes pixels in [span->startx, span->endx),
// two at a time plus a single-pixel tail.
// Unlike the other color-taking routines here, the tint is not component-swapped
// (the parameter name indicates it is already in BGRA order).
// NOTE(review): braces, the condition guarding the tail and any preprocessor guard are
// not visible in this excerpt.
3106 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3109 int x, startx = span->startx, endx = span->endx;
// tint as 16-bit lanes, replicated into both halves so it applies to two pixels
3110 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3111 tint = _mm_packs_epi32(tint, tint);
3112 for (x = startx;x+2 <= endx;x+=2)
// pix2 carries inb in the high byte of each lane, so mulhi(tint, pix2) is (tint * inb) >> 8
3114 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3115 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3116 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3117 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above
3121 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3122 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3123 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3124 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 blend of two spans by the second span's fourth byte:
// out4ub = ina4ub + ((inb4ub - ina4ub) * alpha(inb4ub)) >> 8, for pixels in
// [span->startx, span->endx), two pixels at a time plus a single-pixel tail.
// NOTE(review): braces, the condition guarding the tail and any preprocessor guard are
// not visible in this excerpt.
3129 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3132 int x, startx = span->startx, endx = span->endx;
3133 for (x = startx;x+2 <= endx;x+=2)
3135 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3136 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth component across that pixel's four lanes
3137 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half product equals (difference * blend) >> 8
3138 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3139 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the loop body above (only the low four lanes matter)
3143 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3144 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3145 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3146 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3147 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit SSE2 blend of a span towards one constant color by that color's fourth component:
// out4ub = in4ub + ((color*255 - in4ub) * alpha) >> 8, for pixels in
// [span->startx, span->endx), two pixels at a time plus a single-pixel tail.
// NOTE(review): braces, the condition guarding the tail and any preprocessor guard are
// not visible in this excerpt.
3152 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3155 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255, components 0 and 2 swapped, replicated into both halves as 16-bit lanes
3156 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3157 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend factor: the color's fourth component broadcast to every lane, pre-shifted left by 4
3158 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3159 for (x = startx;x+2 <= endx;x+=2)
3161 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is also shifted left by 4, so the signed high-half product is (difference * alpha) >> 8
3162 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3163 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the loop body above
3167 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3168 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3169 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3176 void DPSOFTRAST_VertexShader_Generic(void)
3178 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3179 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3180 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3181 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3182 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3185 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3187 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3188 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3189 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3190 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3191 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3192 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3194 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3195 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3196 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3198 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3199 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3202 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3204 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3207 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3209 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3212 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3217 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3218 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3223 void DPSOFTRAST_VertexShader_PostProcess(void)
3225 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3226 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3227 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3230 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3232 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3233 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3234 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3235 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3236 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3237 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3238 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3240 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3241 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3243 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3244 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3246 // TODO: implement saturation
3248 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3250 // TODO: implement gammaramps
3252 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3257 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3259 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3262 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3264 // this is never called (because colormask is off when this shader is used)
3265 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3266 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3267 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3268 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3269 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3274 void DPSOFTRAST_VertexShader_FlatColor(void)
3276 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3277 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span: the color texture is multiplied by the
// Color_Ambient uniform (with the Alpha uniform in the fourth component) in SSE2 fixed point.
// Results are written directly into color buffer 0 unless alpha test or a non-opaque blend
// mode is active; then they go to a temporary buffer handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, some local declarations, the loop-index adjustment after the
// 4-pixel path and any per-pixel mask test are not visible in this excerpt.
3280 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3283 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row segment in color buffer 0 (4 bytes per pixel)
3284 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3285 int x, startx = span->startx, endx = span->endx;
3286 __m128i Color_Ambientm;
3287 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3288 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3289 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3290 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3291 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the temporary buffer instead
3292 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3293 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with components 0 and 2 swapped; its fourth lane is cleared
// and replaced by the Alpha uniform scaled by 255, then everything is packed to 16-bit lanes
3294 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3295 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3296 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3297 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3298 for (x = startx;x < endx;x++)
// fast path: at least four pixels remain and all four mask bytes equal 1
3301 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texture bytes go into the high byte of each lane, so mulhi yields (color * texel) >> 8
3304 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3305 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3306 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3307 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3313 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3314 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3315 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish pass
3317 if (pixel == buffer_FragColorbgra8)
3318 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3324 void DPSOFTRAST_VertexShader_VertexColor(void)
3326 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3327 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3328 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span:
// output = texel * (Color_Diffuse * interpolated vertex color + Color_Ambient), in SSE2
// fixed point, with the Alpha uniform in the fourth lane of the ambient term.
// Results are written directly into color buffer 0 unless alpha test or a non-opaque blend
// mode is active; then they go to a temporary buffer handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, some local declarations (data, slope, mod2, pix2), the loop-index
// adjustment after the 4-pixel path and any per-pixel mask test are not visible in this excerpt.
3331 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3334 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row segment in color buffer 0 (4 bytes per pixel)
3335 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3336 int x, startx = span->startx, endx = span->endx;
3337 __m128i Color_Ambientm, Color_Diffusem;
3339 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3340 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3341 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3342 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3343 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3344 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the temporary buffer instead
3345 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3346 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with components 0 and 2 swapped; its fourth lane is replaced
// by the Alpha uniform scaled by 255
3347 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3348 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3349 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3350 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096 with components 0 and 2 swapped; its fourth lane is cleared
3351 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3352 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3353 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color attribute: swap components 0 and 2, advance to startx, scale value and slope by 4096
3354 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3355 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3356 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3357 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3358 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3359 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration here and three more times inside the 4-pixel path
3360 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3362 __m128i color, mod, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3363 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four consecutive z values, one per pixel; each pixel's attribute is data * its z
3366 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3367 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3368 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3369 data = _mm_add_ps(data, slope);
3370 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3371 data = _mm_add_ps(data, slope);
3372 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3373 data = _mm_add_ps(data, slope);
3374 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor) >> 16, plus ambient, times the texel placed in the high byte of each lane
3375 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3376 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3377 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3378 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3379 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path, same formula
3385 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3386 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3387 mod = _mm_packs_epi32(mod, mod);
3388 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3389 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish pass
3391 if (pixel == buffer_FragColorbgra8)
3392 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3398 void DPSOFTRAST_VertexShader_Lightmap(void)
3400 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3401 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3402 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
// output = texel * (Color_Diffuse * lightmap + Color_Ambient), plus Color_Glow * glow texel
// in the GLOW permutation, all in SSE2 fixed point (uniform colors scaled by 256).
// Results are written directly into color buffer 0 unless alpha test or a non-opaque blend
// mode is active; then they go to a temporary buffer handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, the else separating the glow loop from the plain loop, the pix2
// declarations, the loop-index adjustment after the 4-pixel paths and any per-pixel mask
// test are not visible in this excerpt.
3405 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3408 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row segment in color buffer 0 (4 bytes per pixel)
3409 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3410 int x, startx = span->startx, endx = span->endx;
3411 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3412 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3413 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3414 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3415 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3416 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3417 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, lightmap uses TEXCOORD4
3418 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3419 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the finish pass, so render into the temporary buffer instead
3420 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3421 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with components 0 and 2 swapped; its fourth lane is replaced
// by the Alpha uniform scaled by 255
3422 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3423 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3424 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3425 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 256 with components 0 and 2 swapped; its fourth lane is cleared
3426 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3427 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3428 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3429 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture shares TEXCOORD0; glow color is prepared like the diffuse color
3431 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3432 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3433 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3434 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half: ambient, high half: glow color - used by the single-pixel path below
3435 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3436 for (x = startx;x < endx;x++)
3438 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3439 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3442 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3443 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3444 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse*lightmap + ambient) * color + glowcolor * glow, for pixels 0-1 then 2-3
3445 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3446 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3447 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3448 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3449 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3450 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3451 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the upper four lanes of lightmap are zero, so the low half computes
// (diffuse*lightmap + ambient) * color and the high half computes glowcolor * glow;
// the two halves are then added together
3457 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3458 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3459 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3460 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3461 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3462 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same loop without the glow term
3467 for (x = startx;x < endx;x++)
3469 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3470 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3473 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3474 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3475 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3476 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3477 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3478 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3479 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3485 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3486 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3487 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3488 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish pass
3491 if (pixel == buffer_FragColorbgra8)
3492 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations of the LightDirection shader pair: the FakeLight and
// LightDirectionMap shaders that follow call these before their definitions appear.
3497 void DPSOFTRAST_VertexShader_LightDirection(void);
3498 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex shader entry point.
// Does no work of its own: it forwards to DPSOFTRAST_VertexShader_LightDirection().
3500 void DPSOFTRAST_VertexShader_FakeLight(void)
3502 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel shader entry point.
// Forwards thread/triangle/span unchanged to DPSOFTRAST_PixelShader_LightDirection(),
// which distinguishes the fake-light case by testing thread->shader_mode.
3505 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3507 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Model-space LightDirectionMap vertex shader entry point.
// Runs the common LightDirection vertex shader, then additionally loads
// DPSOFTRAST_ARRAY_TEXCOORD4 (the pixel shader samples the lightmap and
// deluxemap textures with that texcoord array).
3512 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3514 DPSOFTRAST_VertexShader_LightDirection();
3515 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Model-space LightDirectionMap pixel shader entry point.
// Forwards thread/triangle/span unchanged to DPSOFTRAST_PixelShader_LightDirection(),
// which handles this case by testing thread->shader_mode.
3518 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3520 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Tangent-space LightDirectionMap vertex shader entry point.
// Runs the common LightDirection vertex shader, then additionally loads
// DPSOFTRAST_ARRAY_TEXCOORD4 (the pixel shader samples the lightmap and
// deluxemap textures with that texcoord array).
3525 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3527 DPSOFTRAST_VertexShader_LightDirection();
3528 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Tangent-space LightDirectionMap pixel shader entry point.
// Forwards thread/triangle/span unchanged to DPSOFTRAST_PixelShader_LightDirection(),
// which handles this case by testing thread->shader_mode.
3531 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3533 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for directional lighting.
// For every vertex it expresses the LightDir uniform and the vertex-to-eye
// vector in the per-vertex (svector, tvector, normal) basis and stores the
// results in the TEXCOORD5 (light) and TEXCOORD6 (eye) post-transform arrays,
// then projects the positions with the model-view-projection matrix.
// Takes no parameters; all inputs and outputs live in the global dpsoftrast state.
3538 void DPSOFTRAST_VertexShader_LightDirection(void)
3541 int numvertices = dpsoftrast.numvertices;
3543 float LightVector[4];
3544 float EyePosition[4];
3545 float EyeVectorModelSpace[4];
// fetch the light direction and eye position uniforms (4 floats each)
3551 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3552 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3553 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3554 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3555 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3556 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3557 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3558 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// populate the post-transform arrays read by the loop below; TEXCOORD0 goes
// through the TexMatrix uniform, the others are loaded as-is
3559 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3560 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3561 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3562 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3563 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3564 for (i = 0;i < numvertices;i++)
// per-vertex inputs: position, and the basis vectors held in TEXCOORD1 (svector),
// TEXCOORD2 (tvector) and TEXCOORD3 (normal); only xyz are used
3566 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3567 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3568 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3569 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3570 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3571 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3572 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3573 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3574 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3575 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3576 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3577 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (dot(svector, LightDir), dot(tvector, LightDir), dot(normal, LightDir)),
// written to TEXCOORD5 with w forced to 0
3578 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3579 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3580 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3581 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3582 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3583 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3584 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// EyeVector = (EyePosition - position) expressed in the same basis,
// written to TEXCOORD6 with w forced to 0 (not normalized here)
3585 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3586 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3587 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3588 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3589 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3590 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3591 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3592 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3593 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3594 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// project positions with the ModelViewProjection matrix; done last so the loop
// above reads the unprojected positions.
// NOTE(review): the meaning of the -1 argument is defined by
// DPSOFTRAST_Array_TransformProject, which is not shown here - confirm.
3596 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helpers used by the pixel shaders.
// Min/Max: classic ternary macros; each argument may be evaluated twice, so
// do not pass expressions with side effects.
3599 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3600 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Vector3Dot: dot product of the first three elements of two indexable operands.
3601 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Vector3LengthSquared / Vector3Length: built on Vector3Dot; Length calls sqrt(),
// i.e. the double-precision routine.
3602 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3603 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Vector3Normalize: multi-line macro that starts by computing the length of v.
// NOTE(review): v is used unparenthesized, so pass a plain array name. The rest of
// the macro body is presumably a guarded scale of v by 1/len - confirm against the
// full definition.
3604 #define DPSOFTRAST_Vector3Normalize(v)\
3607 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the LightDirection, LightDirectionMap (model space and
// tangent space) and FakeLight shader modes.
//
// thread   - per-thread state; supplies uniform4f, shader_permutation,
//            shader_mode and shader_exactspecularmath
// triangle - triangle whose attributes are interpolated via DPSOFTRAST_CALCATTRIB4F
// span     - span to shade; pixels x in [span->startx, span->endx) are processed
//
// The work is split by thread->shader_permutation into three paths: a SPECULAR
// path (diffuse + specular), a DIFFUSE path, and a remaining path that only
// applies ambient colour. Each path optionally adds the glow texture when
// SHADERPERMUTATION_GLOW is set. Results are written as bytes into
// buffer_FragColorbgra8 and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3618 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// stack scratch buffers sized for the maximum span length; the texture buffers
// hold 4 bytes per pixel and are indexed as [x*4 + channel]
3620 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3621 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3622 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3623 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3624 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3625 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3626 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3627 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3628 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3629 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3630 int x, startx = span->startx, endx = span->endx;
3631 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// "data"/"slope" pairs are filled by DPSOFTRAST_CALCATTRIB4F and evaluated
// below as (data + slope * x)
3632 float LightVectordata[4];
3633 float LightVectorslope[4];
3634 float EyeVectordata[4];
3635 float EyeVectorslope[4];
3636 float VectorSdata[4];
3637 float VectorSslope[4];
3638 float VectorTdata[4];
3639 float VectorTslope[4];
3640 float VectorRdata[4];
3641 float VectorRslope[4];
3643 float diffusetex[4];
3645 float surfacenormal[4];
3646 float lightnormal[4];
3647 float lightnormal_modelspace[4];
3649 float specularnormal[4];
3652 float SpecularPower;
// colour uniforms are stored with components 0 and 2 swapped (local [2] <- uniform
// component 0, local [0] <- uniform component 2) so that local index i lines up
// with byte i of the *bgra8 texture buffers they are multiplied with
3654 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3655 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3656 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3657 Color_Glow[3] = 0.0f;
3658 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3659 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3660 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the ambient alpha slot carries the Alpha uniform; it is the only factor applied
// to the output alpha channel (d[3]) in every path below
3661 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3662 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3663 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3664 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3665 Color_Pants[3] = 0.0f;
3666 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3667 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3668 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3669 Color_Shirt[3] = 0.0f;
// span setup, then sample the textures every path needs: colour always,
// pants/shirt only with COLORMAPPING, glow only with GLOW
3670 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3671 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3672 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3674 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3675 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3677 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3679 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: SPECULAR permutation (diffuse + specular lighting) ----
3681 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3683 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3684 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3685 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3686 Color_Diffuse[3] = 0.0f;
3687 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3688 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3689 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3690 LightColor[3] = 0.0f;
3691 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3692 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3693 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3694 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3695 Color_Specular[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the gloss texture's
// alpha byte (glosstex[3]) to form the pow() exponent
3696 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3697 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3698 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// mode-specific inputs: the model-space map needs the S/T/R basis interpolants
// plus lightmap/deluxemap samples, the tangent-space map only the two samples,
// fake light nothing extra; the last line sets up the TEXCOORD5 light vector
3700 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3702 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3703 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3704 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3705 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3706 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3708 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3710 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3711 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3713 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3715 // nothing of this needed
3719 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// per-pixel loop of the specular path
3722 for (x = startx;x < endx;x++)
// diffuse texel as floats in the 0..255 byte range; with COLORMAPPING the
// pants/shirt texels scaled by their colours are added on top
3725 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3726 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3727 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3728 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3729 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3731 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3732 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3733 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3734 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3736 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3737 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3738 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3739 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte * (1/128) - 1 maps 0..255 to about -1..+1;
// bytes 2,1,0 feed components 0,1,2 (undoing the BGRA byte order)
3740 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3741 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3742 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3743 DPSOFTRAST_Vector3Normalize(surfacenormal);
// choose the light direction (lightnormal) and LightColor according to shader_mode
3745 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3747 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3748 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3749 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3750 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3752 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3753 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3754 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3755 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3757 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3758 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3759 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3760 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3762 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3763 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3764 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3765 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3767 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3768 DPSOFTRAST_Vector3Normalize(lightnormal);
3770 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// lightmap bytes are scaled by 1/256 and divided by the z component of the light
// direction, with that divisor clamped to at least 0.25.
// NOTE(review): this uses max() rather than the DPSOFTRAST_Max macro defined in
// this file - confirm where max() comes from on all supported compilers.
3772 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3773 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3774 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3775 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// tangent-space map: the deluxemap texel is used directly as the light direction
// (not renormalized in the lines shown) and the lightmap texel / 256 as light colour
3778 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3780 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3781 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3782 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3784 float f = 1.0f / 256.0f;
3785 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3786 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3787 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// fake light: the light comes from the eye, so the normalized interpolated eye
// vector is the light direction and the light colour is white.
// z scales the interpolated attribute; presumably the per-pixel value taken from
// buffer_z - TODO confirm.
3790 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3792 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3793 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3794 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3795 DPSOFTRAST_Vector3Normalize(lightnormal);
3797 LightColor[0] = 1.0;
3798 LightColor[1] = 1.0;
3799 LightColor[2] = 1.0;
// presumably the fallback for the remaining shader modes - confirm: light direction
// from the interpolated TEXCOORD5 light vector, LightColor stays at the uniform value
3803 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3804 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3805 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3806 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: N.L clamped at zero
3809 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term, variant 1 (shader_exactspecularmath): reflection vector dotted
// with the normalized eye vector
3811 if(thread->shader_exactspecularmath)
3813 // reflect lightnormal at surfacenormal, take the negative of that
3814 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3816 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3817 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3818 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3819 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3821 // dot of this and normalize(EyeVectorFogDepth.xyz)
3822 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3823 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3824 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3825 DPSOFTRAST_Vector3Normalize(eyenormal);
3827 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// specular term, variant 2 (presumably the non-exact case - confirm): half vector
// normalize(lightnormal + eyenormal) dotted with the surface normal
3831 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3832 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3833 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3834 DPSOFTRAST_Vector3Normalize(eyenormal);
3836 specularnormal[0] = lightnormal[0] + eyenormal[0];
3837 specularnormal[1] = lightnormal[1] + eyenormal[1];
3838 specularnormal[2] = lightnormal[2] + eyenormal[2];
3839 DPSOFTRAST_Vector3Normalize(specularnormal);
3841 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform * gloss alpha byte / 255; pow() is the
// double-precision routine, its result is narrowed on assignment
3844 specular = pow(specular, SpecularPower * glosstex[3]);
// combine: [glow +] ambient + (diffuse + specular) * LightColor per colour channel,
// alpha = diffuse alpha * Color_Ambient[3]. Values are truncated to int and clamped
// to 255 on the upper side only.
// NOTE(review): there is no lower clamp; a negative sum would wrap when stored as
// unsigned char - confirm inputs are never negative.
3845 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3847 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3848 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3849 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3850 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3854 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3855 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3856 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3857 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3860 buffer_FragColorbgra8[x*4+0] = d[0];
3861 buffer_FragColorbgra8[x*4+1] = d[1];
3862 buffer_FragColorbgra8[x*4+2] = d[2];
3863 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: DIFFUSE permutation (no specular term) ----
// Same setup and light-direction selection as path 1, minus the gloss texture,
// specular uniforms and pants/shirt colour mapping.
3866 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3868 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3869 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3870 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3871 Color_Diffuse[3] = 0.0f;
3872 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3873 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3874 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3875 LightColor[3] = 0.0f;
3876 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3878 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3880 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3881 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3882 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3883 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3884 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3886 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3888 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3889 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// unlike path 1, the eye vector is only needed here for fake light (as the light
// direction), so it is interpolated in this branch only
3891 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3893 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3897 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// per-pixel loop of the diffuse path
3900 for (x = startx;x < endx;x++)
3903 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3904 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3905 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3906 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3907 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3908 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3909 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3910 DPSOFTRAST_Vector3Normalize(surfacenormal);
3912 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3914 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3915 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3916 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3917 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3919 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3920 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3921 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3922 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3924 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3925 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3926 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3927 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3929 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3930 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3931 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3932 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3934 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3935 DPSOFTRAST_Vector3Normalize(lightnormal);
3937 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// NOTE(review): max() rather than DPSOFTRAST_Max, same as in the specular path
3939 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3940 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3941 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3942 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3945 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3947 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3948 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3949 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3951 float f = 1.0f / 256.0f;
3952 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3953 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3954 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3957 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3959 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3960 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3961 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3962 DPSOFTRAST_Vector3Normalize(lightnormal);
3964 LightColor[0] = 1.0;
3965 LightColor[1] = 1.0;
3966 LightColor[2] = 1.0;
3970 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3971 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3972 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3973 DPSOFTRAST_Vector3Normalize(lightnormal);
// combine: [glow +] diffusetex * (ambient + diffuse colour * N.L * LightColor),
// upper clamp at 255 only
3976 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3977 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3979 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3980 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3981 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3982 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3986 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3987 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3988 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3989 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3991 buffer_FragColorbgra8[x*4+0] = d[0];
3992 buffer_FragColorbgra8[x*4+1] = d[1];
3993 buffer_FragColorbgra8[x*4+2] = d[2];
3994 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (presumably taken when neither SPECULAR nor DIFFUSE
// is set - confirm) ----
// output = [glow +] diffusetex * Color_Ambient, upper clamp at 255 only
3999 for (x = startx;x < endx;x++)
4002 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4003 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4004 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4005 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4007 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4009 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4010 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4011 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4012 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4016 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4017 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4018 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4019 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4021 buffer_FragColorbgra8[x*4+0] = d[0];
4022 buffer_FragColorbgra8[x*4+1] = d[1];
4023 buffer_FragColorbgra8[x*4+2] = d[2];
4024 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish routine
4027 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for positional (light source) lighting.
// For every vertex it expresses the vertex-to-light and vertex-to-eye vectors in
// the per-vertex (svector, tvector, normal) basis and writes them back over
// TEXCOORD1 (light vector) and TEXCOORD2 (eye vector). Afterwards it projects the
// positions and fills TEXCOORD3 using the ModelToLight matrix.
// Takes no parameters; all inputs and outputs live in the global dpsoftrast state.
4032 void DPSOFTRAST_VertexShader_LightSource(void)
4035 int numvertices = dpsoftrast.numvertices;
4036 float LightPosition[4];
4037 float LightVector[4];
4038 float LightVectorModelSpace[4];
4039 float EyePosition[4];
4040 float EyeVectorModelSpace[4];
// fetch the light position and eye position uniforms (4 floats each)
4046 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4047 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4048 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4049 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4050 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4051 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4052 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4053 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// populate the post-transform arrays; TEXCOORD0 goes through the TexMatrix
// uniform, the others are loaded as-is (TEXCOORD4 is loaded but not otherwise
// touched in this function)
4054 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4055 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4056 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4057 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4058 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4059 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4060 for (i = 0;i < numvertices;i++)
// copy position and the basis vectors (TEXCOORD1 = svector, TEXCOORD2 = tvector,
// TEXCOORD3 = normal) into locals first: TEXCOORD1 and TEXCOORD2 are overwritten
// with the results further down in this same iteration
4062 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4063 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4064 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4065 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4066 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4067 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4068 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4069 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4070 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4071 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4072 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4073 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (LightPosition - position) expressed in the (svector, tvector,
// normal) basis, stored into TEXCOORD1 with w forced to 0 (not normalized here)
4074 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4075 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4076 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4077 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4078 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4079 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4080 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4081 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4082 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4083 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// EyeVector = (EyePosition - position) in the same basis, stored into TEXCOORD2
// with w forced to 0
4084 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4085 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4086 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4087 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4088 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4089 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4090 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4091 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4092 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4093 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// project positions with the ModelViewProjection matrix, then fill TEXCOORD3 from
// DPSOFTRAST_ARRAY_POSITION using the ModelToLight matrix (this replaces the normal
// that was read from TEXCOORD3 in the loop above).
// NOTE(review): presumably DPSOFTRAST_Array_Transform takes (out, in, matrix) and
// reads the untransformed input positions rather than the projected ones; the
// meaning of the -1 argument is likewise defined by the helper - confirm both.
4095 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4096 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span pixel shader paired with DPSOFTRAST_VertexShader_LightSource in
// DPSOFTRAST_ShaderModeTable.
//
// thread   - per-thread state; supplies uniform4f[] and shader_permutation flags.
// triangle - triangle the span belongs to (used to set up attribute interpolants).
// span     - horizontal run of pixels to shade; pixels startx..endx-1 are processed.
//
// Outline: load colors from uniforms, set up interpolants for TEXCOORD1/2/3,
// sample the textures needed by the active permutation into per-span byte
// buffers, then run one of three per-pixel loops (SPECULAR, DIFFUSE, or
// neither) and pass buffer_FragColorbgra8 to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// NOTE(review): this listing omits some short lines of the function (braces,
// the bodies guarded by the `attenuation < 0.01f` tests, and the declarations /
// assignments of z, glosstex, eyenormal, diffuse, specular, attenuation, f and
// d are not visible here) - consult the full file before editing.
4099 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one float per pixel for z, four bytes per pixel
// for each sampled texture and for the output color.
4102 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4103 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4104 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4105 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4106 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4107 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4108 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4109 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4110 int x, startx = span->startx, endx = span->endx;
4111 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// For each interpolated attribute, "data" is the base value and "slope" the
// per-x increment: the loops below evaluate (data + slope*x) * z.
4112 float CubeVectordata[4];
4113 float CubeVectorslope[4];
4114 float LightVectordata[4];
4115 float LightVectorslope[4];
4116 float EyeVectordata[4];
4117 float EyeVectorslope[4];
4119 float diffusetex[4];
4121 float surfacenormal[4];
4122 float lightnormal[4];
4124 float specularnormal[4];
4127 float SpecularPower;
4128 float CubeVector[4];
// Color uniforms are copied with components 0 and 2 swapped, so index 0 of
// each local color lines up with byte 0 of the *bgra8 buffers (presumably
// converting RGB uniforms to BGR order - confirm against the uniform setup).
// Color_Glow is loaded here but is not referenced by any line visible below.
4131 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4132 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4133 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4134 Color_Glow[3] = 0.0f;
4135 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4136 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4137 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component is taken from the separate Alpha uniform.
4138 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4139 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4140 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4141 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4142 Color_Diffuse[3] = 0.0f;
4143 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4144 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4145 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4146 Color_Specular[3] = 0.0f;
4147 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4148 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4149 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4150 Color_Pants[3] = 0.0f;
4151 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4152 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4153 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4154 Color_Shirt[3] = 0.0f;
4155 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4156 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4157 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4158 LightColor[3] = 0.0f;
// Pre-scaled by 1/255 because it is later multiplied by glosstex[3], which is
// read from a byte buffer (0..255).
4159 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// TEXCOORD1 -> light vector, TEXCOORD2 -> eye vector, TEXCOORD3 -> cube vector.
4160 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4161 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4162 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4163 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4164 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// Sample the base color for the whole span; pants/shirt, cube filter, normal
// and gloss textures are only sampled when the matching permutation needs them.
4165 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4166 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4168 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4169 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4171 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4172 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- Path 1: SPECULAR permutation (ambient + diffuse + specular) ----
4173 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4175 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4176 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4177 for (x = startx;x < endx;x++)
// Attenuation is 1 - |CubeVector|^2.
// NOTE(review): the assignment of z and the statement guarded by each
// `attenuation < 0.01f` test are not visible in this listing; presumably z
// comes from buffer_z[x] and the guarded statement skips the pixel - confirm.
4180 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4181 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4182 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4183 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4184 if (attenuation < 0.01f)
4186 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4188 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4189 if (attenuation < 0.01f)
// Base color as floats in byte scale (0..255), no normalization to 0..1.
4193 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4194 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4195 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4196 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4197 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add tinted pants/shirt layers; Color_Pants[3] and Color_Shirt[3] are 0.0f,
// so the [3] line leaves alpha unchanged.
4199 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4200 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4201 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4202 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4204 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4205 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4206 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4207 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte/128 - 1 maps 0..255 to about -1..+1; bytes
// 2,1,0 of the BGRA texel become components 0,1,2 of the normal.
4208 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4209 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4210 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4211 DPSOFTRAST_Vector3Normalize(surfacenormal);
4213 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4214 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4215 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4216 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(N, L) clamped at zero.
4218 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Two specular variants: reflection-vector based when shader_exactspecularmath
// is set; the lines from 4240 on use the normalized sum of light and eye
// vectors (half vector) instead (the separating `else` is not visible here).
4220 if(thread->shader_exactspecularmath)
4222 // reflect lightnormal at surfacenormal, take the negative of that
4223 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4225 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4226 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4227 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4228 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4230 // dot of this and normalize(EyeVectorFogDepth.xyz)
4231 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4232 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4233 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4234 DPSOFTRAST_Vector3Normalize(eyenormal);
4236 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4240 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4241 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4242 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4243 DPSOFTRAST_Vector3Normalize(eyenormal);
4245 specularnormal[0] = lightnormal[0] + eyenormal[0];
4246 specularnormal[1] = lightnormal[1] + eyenormal[1];
4247 specularnormal[2] = lightnormal[2] + eyenormal[2];
4248 DPSOFTRAST_Vector3Normalize(specularnormal);
4250 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform * gloss alpha (as 0..1, via the 1/255 above).
4252 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine: (diffusetex*(ambient + diffuse*N.L) + gloss*specular) * light color
// * attenuation, optionally times the cube filter texel. Results are clamped
// to 255 from above only; alpha is taken straight from diffusetex[3].
4254 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4256 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4257 attenuation *= (1.0f / 255.0f);
4258 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4259 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4260 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4261 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4265 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4266 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4267 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4268 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4270 buffer_FragColorbgra8[x*4+0] = d[0];
4271 buffer_FragColorbgra8[x*4+1] = d[1];
4272 buffer_FragColorbgra8[x*4+2] = d[2];
4273 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: DIFFUSE permutation (ambient + diffuse, no specular) ----
// Same per-pixel steps as path 1 minus the gloss texture and specular term.
4276 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4278 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4279 for (x = startx;x < endx;x++)
4282 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4283 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4284 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4285 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4286 if (attenuation < 0.01f)
4288 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4290 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4291 if (attenuation < 0.01f)
4295 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4296 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4297 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4298 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4299 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4301 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4302 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4303 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4304 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4306 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4307 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4308 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4309 DPSOFTRAST_Vector3Normalize(surfacenormal);
4311 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4312 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4313 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4314 DPSOFTRAST_Vector3Normalize(lightnormal);
4316 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4317 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4319 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4320 attenuation *= (1.0f / 255.0f);
4321 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4322 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4323 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4324 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4328 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4329 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4330 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4331 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4333 buffer_FragColorbgra8[x*4+0] = d[0];
4334 buffer_FragColorbgra8[x*4+1] = d[1];
4335 buffer_FragColorbgra8[x*4+2] = d[2];
4336 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: neither SPECULAR nor DIFFUSE (ambient term only) ----
// No normal map is sampled; color is diffusetex * Color_Ambient * LightColor
// * attenuation (times the cube filter texel when CUBEFILTER is set).
4341 for (x = startx;x < endx;x++)
4344 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4345 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4346 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4347 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4348 if (attenuation < 0.01f)
4350 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4352 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4353 if (attenuation < 0.01f)
4357 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4358 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4359 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4360 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4361 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4363 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4364 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4365 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4366 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4368 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4370 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4371 attenuation *= (1.0f / 255.0f);
4372 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4373 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4374 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4375 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4379 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4380 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4381 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4382 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4384 buffer_FragColorbgra8[x*4+0] = d[0];
4385 buffer_FragColorbgra8[x*4+1] = d[1];
4386 buffer_FragColorbgra8[x*4+2] = d[2];
4387 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finishing routine.
4390 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_Refraction in
// DPSOFTRAST_ShaderModeTable. Takes no arguments; works on the global
// dpsoftrast state.
// NOTE(review): the helper argument order looks like (destination array,
// source array, matrix) - confirm against the helper definitions.
4396 void DPSOFTRAST_VertexShader_Refraction(void)
// TEXCOORD1 is filled from POSITION using the ModelViewProjection matrix via
// the plain Transform helper; the pixel shader interpolates TEXCOORD1 as
// ModelViewProjectionPosition.
4398 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD0 is transformed in place by the texture matrix.
4399 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// POSITION is transformed in place last, after TEXCOORD1 was derived from the
// untransformed positions above.
4400 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader paired with DPSOFTRAST_VertexShader_Refraction in
// DPSOFTRAST_ShaderModeTable.
//
// thread   - per-thread state; supplies uniform4f[] and the bound textures.
// triangle - triangle the span belongs to.
// span     - horizontal run of pixels to shade; pixels startx..endx-1 are processed.
//
// For each pixel: derive a screen-space texture coordinate from the
// interpolated TEXCOORD1 (ModelViewProjectionPosition), offset it by the x/y of
// the normalized normal-map vector scaled by DistortScaleRefractReflect, fetch
// the refraction texture at mip level 0 (bilinear when the texture has
// DPSOFTRAST_TEXTURE_FILTER_LINEAR, nearest otherwise, both clamped to the
// texture edges) and multiply by RefractColor. Returns early without drawing
// when no refraction texture is bound.
//
// NOTE(review): this listing omits some short lines of the function (braces,
// `else`, and the declarations of iw, v and c are not visible here).
4403 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4405 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4407 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4409 int x, startx = span->startx, endx = span->endx;
4412 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4413 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4416 float ModelViewProjectionPositiondata[4];
4417 float ModelViewProjectionPositionslope[4];
4420 float ScreenScaleRefractReflect[2];
4421 float ScreenCenterRefractReflect[2];
4422 float DistortScaleRefractReflect[2];
4423 float RefractColor[4];
4425 const unsigned char * RESTRICT pixelbase;
4426 const unsigned char * RESTRICT pixel[4];
4427 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4428 if(!texture) return;
// Start of mip level 0; mipmap[0][2] and mipmap[0][3] are used below as that
// level's width and height.
4429 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4432 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4433 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4436 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4439 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4440 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4441 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4442 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
// NOTE(review): components 0 and 1 are read here, while the GLSL quoted further
// down mentions DistortScaleRefractReflect.zw - confirm which pair is intended.
4443 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4444 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// RefractColor is stored with components 0 and 2 swapped so that index 0
// multiplies byte 0 of the BGRA texel.
4445 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4446 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4447 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4448 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4451 for (x = startx;x < endx;x++)
4453 float SafeScreenTexCoord[2];
4454 float ScreenTexCoord[2];
4461 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4462 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4464 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4465 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4466 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4468 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4469 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4470 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4471 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4472 DPSOFTRAST_Vector3Normalize(v);
4473 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4474 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4476 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4477 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// Bilinear fetch in 20.12 fixed point: tc is the texel coordinate * 4096 with a
// half-texel (2048) bias. The bias makes tc negative near the left/top edge, so
// tc must be SIGNED: converting a negative float to unsigned int is undefined
// behaviour and in practice wrapped to a huge value, which the clamps below
// then pinned to the far (right/bottom) edge instead of texel 0.
4479 int tc[2] = { (int)(ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048), (int)(ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048) };
// Low 12 bits = fractional position inside the texel (0..4095, also for negative tc).
4480 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4481 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// Four bilinear weights; each is a product of two 12-bit factors, hence the >>24 below.
4482 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
// Integer texel index = floor(tc / 4096): tc - frac is an exact multiple of
// 4096, so the division is exact and well defined for negative tc too.
4483 int tci[2] = { (tc[0] - (int)frac[0]) / 4096, (tc[1] - (int)frac[1]) / 4096 };
4484 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// Clamp both sample positions to [0, size-1] (clamp-to-edge).
4485 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4486 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4487 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4488 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4489 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4490 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4491 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4492 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4493 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4494 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4495 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// Nearest fetch: truncate to a texel index and clamp to the texture edges.
4499 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4500 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4501 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4502 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4508 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// Tint the fetched texel; alpha comes from RefractColor[3] alone, capped at 255.
4509 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4510 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4511 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4512 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4515 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_Water in
// DPSOFTRAST_ShaderModeTable: only the POSITION array is processed, in place,
// with the ModelViewProjection matrix.
4520 void DPSOFTRAST_VertexShader_Water(void)
4522 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the Water entry of DPSOFTRAST_ShaderModeTable.
// Samples no textures and reads no uniforms: it zero-fills the span's portion
// of the output buffer (all four BGRA bytes 0 for pixels startx..endx-1) and
// finishes the span.
// NOTE(review): looks like a placeholder implementation - confirm.
4526 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4529 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4530 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4531 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Only bytes belonging to the span are cleared; the rest of the buffer stays uninitialized.
4532 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4533 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_ShowDepth in
// DPSOFTRAST_ShaderModeTable: only the POSITION array is processed, in place,
// with the ModelViewProjection matrix.
4538 void DPSOFTRAST_VertexShader_ShowDepth(void)
4540 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the ShowDepth entry of DPSOFTRAST_ShaderModeTable.
// Despite the name, the visible code does not write depth values into the
// color output: it zero-fills the span's portion of the output buffer (all
// four BGRA bytes 0 for pixels startx..endx-1) and finishes the span.
// NOTE(review): looks like a placeholder implementation - confirm.
4543 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4546 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4547 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4548 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Only bytes belonging to the span are cleared; the rest of the buffer stays uninitialized.
4549 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4550 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_DeferredGeometry in
// DPSOFTRAST_ShaderModeTable: only the POSITION array is processed, in place,
// with the ModelViewProjection matrix.
4555 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4557 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the DeferredGeometry entry of
// DPSOFTRAST_ShaderModeTable. Samples no textures and reads no uniforms: it
// zero-fills the span's portion of the output buffer (all four BGRA bytes 0
// for pixels startx..endx-1) and finishes the span.
// NOTE(review): looks like a placeholder implementation - confirm.
4560 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4563 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4564 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4565 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Only bytes belonging to the span are cleared; the rest of the buffer stays uninitialized.
4566 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4567 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage paired with DPSOFTRAST_PixelShader_DeferredLightSource in
// DPSOFTRAST_ShaderModeTable: only the POSITION array is processed, in place,
// with the ModelViewProjection matrix.
4572 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4574 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the DeferredLightSource entry of
// DPSOFTRAST_ShaderModeTable. Samples no textures and reads no uniforms: it
// zero-fills the span's portion of the output buffer (all four BGRA bytes 0
// for pixels startx..endx-1) and finishes the span.
// NOTE(review): looks like a placeholder implementation - confirm.
4577 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4580 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4581 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4582 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Only bytes belonging to the span are cleared; the rest of the buffer stays uninitialized.
4583 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4584 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode descriptor; one instance per entry of
// DPSOFTRAST_ShaderModeTable.
// NOTE(review): the first member (initialized with `2` in every table entry)
// is not visible in this listing.
4589 typedef struct DPSOFTRAST_ShaderModeInfo_s
// Vertex stage entry point; takes no arguments.
4592 void (*Vertex)(void);
// Per-span pixel stage entry point.
4593 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Lists of DPSOFTRAST_ARRAY_* and GL20TU_* values used by the mode. Table
// entries end each list with ~0 (0xFF once stored in an unsigned char),
// presumably a terminator - confirm against the code that reads these lists.
4594 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4595 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4597 DPSOFTRAST_ShaderModeInfo;
// Shader mode dispatch table with SHADERMODE_COUNT entries, indexed by shader
// mode. Each entry gives: a leading integer (2 everywhere; its member is not
// visible in this listing), the vertex function, the span/pixel function, the
// vertex arrays used, and the texture units used. Both lists end with ~0.
// NOTE(review): entry order must match the shader mode enumeration, which is
// defined outside this listing - confirm before reordering or inserting.
// NOTE(review): the last four entries (Water, ShowDepth, DeferredGeometry,
// DeferredLightSource) give no texunits initializer, so that array is
// zero-filled rather than starting with ~0 as in the Depth_Or_Shadow entry.
// Confirm whether the reader of texunits treats 0 as a real texture unit.
4599 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4601 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4602 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4603 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4604 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4605 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4606 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4607 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4608 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4609 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4610 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4611 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4612 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4613 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4614 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4615 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4616 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Resolves every span queued in thread->spans and then empties the queue.
// With depth testing enabled and a depth buffer present, each span gets a
// per-pixel pass/fail mask from thread->fb_depthfunc, is trimmed to the
// passing range, is shaded through the shader mode's Span callback (only when
// a colour buffer and a colour mask exist) and is then revisited for the
// depth-write pass when thread->depthmask is set. Without depth testing the
// mask is filled with 1s before the Span callback runs.
4619 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4626 // unsigned int *colorpixel;
4627 unsigned int *depthpixel;
4633 DPSOFTRAST_State_Triangle *triangle;
4634 DPSOFTRAST_State_Span *span;
// the mask lives on this function's stack, so span->pixelmask is only
// meaningful while this function is running
4635 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4636 for (i = 0; i < thread->numspans; i++)
4638 span = &thread->spans[i];
4639 triangle = &thread->triangles[span->triangle];
4640 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope,
// w[2] = origin) at the span's base position and convert it to fixed point
4642 wslope = triangle->w[0];
4643 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4644 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
// polygon offset is applied once per span: a constant term plus a term
// proportional to the magnitude of the x slope
4645 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel points at the span's base pixel, so x below is span-relative
4646 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4647 startx = span->startx;
// fill pixelmask[startx..endx) according to the depth comparison function
4649 switch(thread->fb_depthfunc)
4652 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4653 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4654 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4655 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4656 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4657 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4658 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4660 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4661 //for (x = startx;x < endx;x++)
4662 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4663 // if there is no color buffer, skip pixel shader
// shrink the span from both ends past pixels that failed the depth test
4664 while (startx < endx && !pixelmask[startx])
4666 while (endx > startx && !pixelmask[endx-1])
4669 continue; // no pixels to fill
4670 span->pixelmask = pixelmask;
4671 span->startx = startx;
4673 // run pixel shader if appropriate
4674 // do this before running depthmask code, to allow the pixelshader
4675 // to clear pixelmask values for alpha testing
4676 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4677 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// second pass over the trimmed range, restarting the depth interpolation
// at the new startx
4678 if (thread->depthmask)
4679 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4685 // no depth testing means we're just dealing with color...
4686 // if there is no color buffer, skip pixel shader
4687 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes when there is no depth test
4689 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4690 span->pixelmask = pixelmask;
4691 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans have been consumed
4695 thread->numspans = 0;
// Payload of the Draw command (opcode 22), read by DPSOFTRAST_Interpret_Draw:
// starty/endy bound the draw vertically, refcount is decremented once per
// interpreting thread, and arrays points at the screen coordinates followed by
// the per-vertex arrays, with element3i/element3s holding the index data.
// NOTE(review): DEFCOMMAND is defined elsewhere; presumably it emits the
// DPSOFTRAST_Command_Draw struct with the common command header - confirm.
4698 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Rasterizer front end: interprets one Draw command on behalf of one thread.
// The command is rejected outright when its [starty, endy) range misses both
// of the thread's y bands. Otherwise every triangle is fetched, clipped
// against the near plane, bounded against the scissor rectangle, given its w
// and attribute plane equations plus a mip level per texture unit, and walked
// scanline by scanline to emit spans into thread->spans. The span queue is
// flushed through DPSOFTRAST_Draw_ProcessSpans whenever it or the triangle
// queue fills up, and once more at the end.
// Both visible exit paths drop one reference on command->refcount; the thread
// that reaches zero frees command->arrays when the command is no larger than
// the bare header, which is the case DPSOFTRAST_Draw_AllocateDrawCommand
// creates when it allocates the vertex data separately.
4700 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4703 int cullface = thread->cullface;
4704 int minx, maxx, miny, maxy;
4705 int miny1, maxy1, miny2, maxy2;
4706 __m128i fbmin, fbmax;
4707 __m128 viewportcenter, viewportscale;
4708 int firstvertex = command->firstvertex;
4709 int numvertices = command->numvertices;
4710 int numtriangles = command->numtriangles;
4711 const int *element3i = command->element3i;
4712 const unsigned short *element3s = command->element3s;
4713 int clipped = command->clipped;
4720 int starty, endy, bandy;
4724 __m128 triangleedge1, triangleedge2, trianglenormal;
4727 DPSOFTRAST_State_Triangle *triangle;
4728 DPSOFTRAST_Texture *texture;
4729 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// bound() both of this thread's y bands (miny1..maxy1 and miny2..maxy2)
// against the vertical extent of the scissor rectangle
4730 miny = thread->fb_scissor[1];
4731 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4732 miny1 = bound(miny, thread->miny1, maxy);
4733 maxy1 = bound(miny, thread->maxy1, maxy);
4734 miny2 = bound(miny, thread->miny2, maxy);
4735 maxy2 = bound(miny, thread->maxy2, maxy);
// nothing to draw if the command overlaps neither band, but this thread's
// reference on the command still has to be released
4736 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4738 if (!ATOMIC_DECREMENT(command->refcount))
4740 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4741 MM_FREE(command->arrays);
4745 minx = thread->fb_scissor[0];
4746 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clip rectangle as packed 16-bit (x, y) pairs; fbmax is made inclusive by
// subtracting 1 from the exclusive maxima
4747 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4748 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4749 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4750 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4751 screen[3] = _mm_setzero_ps();
4752 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4753 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices screen-space float[4] entries;
// the per-vertex arrays follow immediately after
4755 const float *screencoord4f = command->arrays;
4756 const float *arrays = screencoord4f + numvertices*4;
4758 // generate the 3 edges of this triangle
4759 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the three indices from the 16-bit or the 32-bit element array and
// rebase them to firstvertex
4762 e[0] = element3s[i*3+0] - firstvertex;
4763 e[1] = element3s[i*3+1] - firstvertex;
4764 e[2] = element3s[i*3+2] - firstvertex;
4768 e[0] = element3i[i*3+0] - firstvertex;
4769 e[1] = element3i[i*3+1] - firstvertex;
4770 e[2] = element3i[i*3+2] - firstvertex;
// helper macros: SKIPBACKFACE computes the two screen-space edges and the
// scalar (z) part of their cross product and tests its sign;
// CLIPPEDVERTEXLERP/CLIPPEDVERTEXCOPY produce the screen position of a
// clipped or unclipped point (the lerp fraction is
// clipdist[p1] / (clipdist[p1] - clipdist[p2])); GENATTRIB* build the matching
// attribute values using the stored clipfrac
4779 #define SKIPBACKFACE \
4780 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4781 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4782 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4783 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4784 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4788 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4792 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4797 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4798 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4800 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4801 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4803 #define CLIPPEDVERTEXCOPY(k,p1) \
4804 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4806 #define GENATTRIBCOPY(attrib, p1) \
4807 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4808 #define GENATTRIBLERP(attrib, p1, p2) \
4810 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4811 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4813 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4817 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4818 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4819 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4820 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4821 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4822 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4823 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4829 // calculate distance from nearplane
4830 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4831 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4832 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4833 if (clipdist[0] >= 0.0f)
4835 if (clipdist[1] >= 0.0f)
4837 if (clipdist[2] >= 0.0f)
4840 // triangle is entirely in front of nearplane
4841 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4848 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4856 if (clipdist[2] >= 0.0f)
4858 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4865 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4872 else if (clipdist[1] >= 0.0f)
4874 if (clipdist[2] >= 0.0f)
4876 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4883 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4889 else if (clipdist[2] >= 0.0f)
4891 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4896 else continue; // triangle is entirely behind nearplane
4899 // calculate integer y coords for triangle points
// x and y of each point are truncated to integers and packed to 16 bits;
// with three points or fewer, point 2 is repeated in the fourth slot
4900 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4901 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4902 screenmin = _mm_min_epi16(screeni, screenir),
4903 screenmax = _mm_max_epi16(screeni, screenir);
4904 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4905 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the bounding box to the clip rectangle
4906 screenmin = _mm_max_epi16(screenmin, fbmin);
4907 screenmax = _mm_min_epi16(screenmax, fbmax);
4908 // skip offscreen triangles
4909 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// vertical extent after clamping; endy is exclusive
4911 starty = _mm_extract_epi16(screenmin, 1);
4912 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies wholly in the gap between this thread's two bands
4913 if (starty >= maxy1 && endy <= miny2)
// keep only the y halves (upper 16 bits of each 32-bit lane), sign-extended
4915 screeny = _mm_srai_epi32(screeni, 16);
4918 triangle = &thread->triangles[thread->numtriangles];
4920 // calculate attribute plans for triangle data...
4921 // okay, this triangle is going to produce spans, we'd better project
4922 // the interpolants now (this is what gives perspective texturing),
4923 // this consists of simply multiplying all arrays by the W coord
4924 // (which is basically 1/Z), which will be undone per-pixel
4925 // (multiplying by Z again) to get the perspective-correct array
4928 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4929 __m128 mipedgescale, mipdensity;
// divide both screen-space edges by the scalar component of the normal so
// that attribute differences along the edges turn into per-pixel x/y slopes
4930 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4931 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4932 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4933 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4934 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane equation of w itself, stored as triangle->w[0] = x slope,
// w[1] = y slope, w[2] = origin
4935 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4936 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4937 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4938 attribedge1 = _mm_sub_ss(w0, w1);
4939 attribedge2 = _mm_sub_ss(w2, w1);
4940 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4941 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4942 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4943 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4944 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4945 _mm_store_ss(&triangle->w[0], attribxslope);
4946 _mm_store_ss(&triangle->w[1], attribyslope);
4947 _mm_store_ss(&triangle->w[2], attriborigin);
4948 mipedgescale = _mm_setzero_ps();
// one plane equation (x slope, y slope, origin) per attribute array listed
// for the current shader mode, with the values pre-multiplied by w
4949 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4951 __m128 attrib0, attrib1, attrib2;
4952 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4953 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// the arrays are packed back to back, numvertices float[4] each, in
// shader-table order
4955 arrays += numvertices*4;
4956 GENATTRIBS(attrib0, attrib1, attrib2);
4957 attriborigin = _mm_mul_ps(attrib1, w1);
4958 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4959 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4960 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4961 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4962 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4963 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4964 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4965 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array that drives LOD selection: attribute delta along each edge
// multiplied by the reciprocal of that edge's screen-space length
4966 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4968 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4969 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4970 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4971 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for every texture unit the shader mode samples;
// units left untouched keep mip 0
4975 memset(triangle->mip, 0, sizeof(triangle->mip));
4976 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4978 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4979 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4981 texture = thread->texbound[texunit];
4982 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by the two integers stored at mipmap[0][2..3], square and sum per
// edge, then take the smaller of the two edge densities
4984 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4985 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4986 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4987 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4988 // this will be multiplied in the texturing routine by the texture resolution
4989 y = _mm_cvtss_si32(mipdensity);
// the density is a squared quantity, hence half of the base-2 logarithm
4992 y = (int)(log((float)y)*0.5f/M_LN2);
4993 if (y > texture->mipmaps - 1)
4994 y = texture->mipmaps - 1;
4995 triangle->mip[texunit] = y;
// scanline walk: bandy first limits the walk to band 1 (maxy1); the loop
// increment then switches the limit to band 2 and moves y up to miny2
5001 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5004 __m128 xcoords, xslope;
// per point: all-ones lane when the current scanline is below it (y > point y)
5005 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5006 int yccmask = _mm_movemask_epi8(ycc);
5007 int edge0p, edge0n, edge1p, edge1n;
// active-edge selection for four-point polygons (edge = previous point p to
// next point n)
5014 case 0xFFFF: /*0000*/ y = endy; continue;
5015 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5016 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5017 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5018 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5019 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5020 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5021 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5022 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5023 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5024 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5025 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5026 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5027 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5028 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5029 case 0x0000: /*1111*/ y++; continue;
// active-edge selection for three-point polygons
5037 case 0xFFFF: /*000*/ y = endy; continue;
5038 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5039 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5040 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5041 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5042 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5043 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5044 case 0x0000: /*111*/ y++; continue;
// nexty = smallest y among the points not yet passed (passed lanes turn into
// 0x7FFF after the shift), capped to the last row of the current band
5047 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5048 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5049 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5050 nexty = _mm_extract_epi16(ycc, 0);
5051 if (nexty >= bandy) nexty = bandy-1;
// dx/dy of both active edges and their x position at scanline y, plus a
// 0.5 bias before the float-to-int conversion below
5052 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5053 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5054 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5055 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5056 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// swap the two edges so the one with the smaller x comes first
5057 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5059 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5060 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// step both edges one scanline at a time up to and including nexty
5062 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5064 int startx, endx, offset;
5065 startx = _mm_cvtss_si32(xcoords);
5066 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5069 if (startx < 0) startx = 0;
// NOTE(review): moves startx by a multiple of DPSOFTRAST_DRAW_MAXSPANLENGTH
// relative to minx; the mask form assumes that constant is a power of two - confirm
5070 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5072 if (endx > maxx) endx = maxx;
5073 if (startx >= endx) continue;
// cut the scanline into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH
// pixels; span->startx/endx are relative to the chunk's offset
5074 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5076 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5077 span->triangle = thread->numtriangles;
5080 span->startx = max(minx - offset, 0);
5081 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5082 if (span->startx >= span->endx)
// span queue full: resolve it now (ProcessSpans resets numspans)
5084 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5085 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: resolve the pending spans before reusing slot 0
5090 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5092 DPSOFTRAST_Draw_ProcessSpans(thread);
5093 thread->numtriangles = 0;
// release this thread's reference; the last one out frees separately
// allocated vertex data
5097 if (!ATOMIC_DECREMENT(command->refcount))
5099 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5100 MM_FREE(command->arrays);
// resolve whatever is still queued for this command
5103 if (thread->numspans > 0 || thread->numtriangles > 0)
5105 DPSOFTRAST_Draw_ProcessSpans(thread);
5106 thread->numtriangles = 0;
// Reserves a Draw command together with the storage its vertex stage writes.
// The data area holds, in order: numvertices screen-space float[4] entries,
// numvertices position float[4] entries, one float[4] array per entry of the
// current shader mode's arrays[] list, and a copy of the index data
// (16-bit or 32-bit). When header plus data would exceed
// DPSOFTRAST_DRAW_MAXCOMMANDSIZE the header alone goes into the command pool
// and the data is obtained from MM_CALLOC; otherwise the data follows the
// header inside the pool. dpsoftrast.screencoord4f and dpsoftrast.post_array4f
// are pointed into the data area as a side effect.
// NOTE(review): the MM_CALLOC result is used without a visible NULL check.
5111 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5115 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// screen coordinates plus the position array are always present
5116 int datasize = 2*numvertices*sizeof(float[4]);
5117 DPSOFTRAST_Command_Draw *command;
5118 unsigned char *data;
// one extra float[4] array per attribute array used by the shader mode
5119 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5121 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5122 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5124 datasize += numvertices*sizeof(float[4]);
// room for the index copy, sized for whichever element type is in use
5127 datasize += numtriangles*sizeof(unsigned short[3]);
5129 datasize += numtriangles*sizeof(int[3]);
5130 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for one pool command: keep only the header in the pool and put
// the data in its own zeroed allocation
5131 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5133 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5134 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data sits directly behind the header in the pool
5138 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5139 data = (unsigned char *)command + commandsize;
5141 command->firstvertex = firstvertex;
5142 command->numvertices = numvertices;
5143 command->numtriangles = numtriangles;
5144 command->arrays = (float *)data;
// carve the data area up in the same order DPSOFTRAST_Interpret_Draw reads it
5145 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5146 dpsoftrast.firstvertex = firstvertex;
5147 dpsoftrast.numvertices = numvertices;
5148 dpsoftrast.screencoord4f = (float *)data;
5149 data += numvertices*sizeof(float[4]);
5150 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5151 data += numvertices*sizeof(float[4]);
5152 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5154 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5155 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5157 dpsoftrast.post_array4f[j] = (float *)data;
5158 data += numvertices*sizeof(float[4]);
// the indices are copied so the caller's element arrays need not outlive
// this call
5160 command->element3i = NULL;
5161 command->element3s = NULL;
5164 command->element3s = (unsigned short *)data;
5165 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5169 command->element3i = (int *)data;
5170 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the current shader
// mode's Vertex callback, records the draw's vertical extent and clip flag,
// and hands the command to the rasterizer threads. A draw whose clamped
// extent is empty is undone again (its separately allocated data freed and
// the command removed from the pool).
5175 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5177 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5178 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the draw's vertical extent to the framebuffer height
5179 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5180 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty extent: release the data and take the command back out of the pool
5181 if (command->starty >= command->endy)
5183 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5184 MM_FREE(command->arrays);
5185 DPSOFTRAST_UndoCommand(command->commandsize);
5188 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread drops its own in DPSOFTRAST_Interpret_Draw
5189 command->refcount = dpsoftrast.numthreads;
5191 if (dpsoftrast.usethreads)
// publish the command, then wake only the starving threads whose bands
// overlap the draw's vertical extent
5194 DPSOFTRAST_Draw_SyncCommands();
5195 for (i = 0; i < dpsoftrast.numthreads; i++)
5197 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5198 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5199 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the non-threaded branch - confirm
5204 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command (opcode 23) carrying the new framebuffer width and
// height.
5208 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler: only flags the thread's framebuffer-derived state for
// revalidation; the command's width/height are not read here.
5209 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5211 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point that switches the framebuffer: stores the new dimensions,
// depth buffer and up to four colour buffers in dpsoftrast, recomputes the
// viewport centre/scale, and queues a SetRenderTargets command carrying the
// new width and height for the threads.
5213 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5215 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any target or dimension differs from the current state
5216 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5217 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5218 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5220 dpsoftrast.fb_width = width;
5221 dpsoftrast.fb_height = height;
5222 dpsoftrast.fb_depthpixels = depthpixels;
5223 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5224 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5225 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5226 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// the viewport mapping depends on the framebuffer dimensions
5227 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5228 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5229 command->width = width;
5230 command->height = height;
// Executes the commands in dpsoftrast.commandpool on behalf of one thread,
// starting at thread->commandoffset and stopping when the offset equals
// endoffset. Fixed-size commands advance by their aligned struct size, Draw
// commands by command->commandsize; an offset at or beyond
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL wraps back to 0. The thread's offset is
// written back after each Draw command and again when the loop ends.
5233 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5235 int commandoffset = thread->commandoffset;
5236 while (commandoffset != endoffset)
5238 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5239 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, then step over the aligned command struct
5241 #define INTERPCOMMAND(name) \
5242 case DPSOFTRAST_OPCODE_##name : \
5243 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5244 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5245 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5246 commandoffset = 0; \
5248 INTERPCOMMAND(Viewport)
5249 INTERPCOMMAND(ClearColor)
5250 INTERPCOMMAND(ClearDepth)
5251 INTERPCOMMAND(ColorMask)
5252 INTERPCOMMAND(DepthTest)
5253 INTERPCOMMAND(ScissorTest)
5254 INTERPCOMMAND(Scissor)
5255 INTERPCOMMAND(BlendFunc)
5256 INTERPCOMMAND(BlendSubtract)
5257 INTERPCOMMAND(DepthMask)
5258 INTERPCOMMAND(DepthFunc)
5259 INTERPCOMMAND(DepthRange)
5260 INTERPCOMMAND(PolygonOffset)
5261 INTERPCOMMAND(CullFace)
5262 INTERPCOMMAND(AlphaTest)
5263 INTERPCOMMAND(AlphaFunc)
5264 INTERPCOMMAND(SetTexture)
5265 INTERPCOMMAND(SetShader)
5266 INTERPCOMMAND(Uniform4f)
5267 INTERPCOMMAND(UniformMatrix4f)
5268 INTERPCOMMAND(Uniform1i)
5269 INTERPCOMMAND(SetRenderTargets)
// Draw commands are variable-sized, so the stored commandsize is used, and
// progress is written back straight away
5271 case DPSOFTRAST_OPCODE_Draw:
5272 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5273 commandoffset += command->commandsize;
5274 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5276 thread->commandoffset = commandoffset;
5279 case DPSOFTRAST_OPCODE_Reset:
// record how far this thread has got
5284 thread->commandoffset = commandoffset;
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
// Loops while thread->index is non-negative: interprets any commands between
// its own offset and dpsoftrast.drawcommand, then, under drawmutex, goes to
// sleep on drawcond if it is still caught up. Before sleeping it signals
// waitcond when another party has set thread->waiting, and it advertises its
// idle state through thread->starving.
5287 static int DPSOFTRAST_Draw_Thread(void *data)
5289 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5290 while(thread->index >= 0)
5292 if (thread->commandoffset != dpsoftrast.drawcommand)
5294 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5298 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex so that a command published in the meantime, or a
// shutdown request, is not slept through
5299 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5301 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5302 thread->starving = true;
5303 Thread_CondWait(thread->drawcond, thread->drawmutex);
5304 thread->starving = false;
5306 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread up to dpsoftrast.drawcommand and then marks the command
// pool as empty (usedcommands = 0).
// Threaded operation: a first pass wakes starving threads that still have
// work pending, a second pass blocks on each thread's waitcond until that
// thread has caught up. The wait is a loop rather than a single test because
// a condition-variable wait may return without a matching signal; going on
// after such a wakeup would reset the pool while the worker is still reading
// commands from it.
// The final pass interprets outstanding commands on the calling thread.
// NOTE(review): presumably that pass is the non-threaded branch - confirm.
5312 static void DPSOFTRAST_Draw_FlushThreads(void)
5314 DPSOFTRAST_State_Thread *thread;
5316 DPSOFTRAST_Draw_SyncCommands();
5317 if (dpsoftrast.usethreads)
// pass 1: nudge idle threads that have not reached drawcommand yet
5319 for (i = 0; i < dpsoftrast.numthreads; i++)
5321 thread = &dpsoftrast.threads[i];
5322 if (thread->commandoffset != dpsoftrast.drawcommand)
5324 Thread_LockMutex(thread->drawmutex);
5325 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5326 Thread_CondSignal(thread->drawcond);
5327 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each thread to finish; the worker signals waitcond when it
// finds itself caught up while thread->waiting is set
5330 for (i = 0; i < dpsoftrast.numthreads; i++)
5332 thread = &dpsoftrast.threads[i];
5333 if (thread->commandoffset != dpsoftrast.drawcommand)
5335 Thread_LockMutex(thread->drawmutex);
// loop on the predicate: a wakeup alone does not prove the thread is done
5336 while (thread->commandoffset != dpsoftrast.drawcommand)
5338 thread->waiting = true;
5339 Thread_CondWait(thread->waitcond, thread->drawmutex);
5340 thread->waiting = false;
5342 Thread_UnlockMutex(thread->drawmutex);
// run the outstanding commands for each thread state on the caller
5348 for (i = 0; i < dpsoftrast.numthreads; i++)
5350 thread = &dpsoftrast.threads[i];
5351 if (thread->commandoffset != dpsoftrast.drawcommand)
5352 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// everything has been consumed, so the pool can be reused from scratch
5355 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point: brings all rasterizer threads up to date with the
// queued commands by calling DPSOFTRAST_Draw_FlushThreads.
5358 void DPSOFTRAST_Flush(void)
5360 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments and returning nothing.
// NOTE(review): presumably completes all pending work in the same way as
// DPSOFTRAST_Flush - confirm against the function body.
5363 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and its worker threads.
//   width, height  framebuffer dimensions; also the initial viewport/scissor
//   numthreads     requested worker count; threading is used only when this
//                  is positive and Thread_HasThreads() reports support, and
//                  the count is then bound()ed to 1..64 (otherwise 1)
//   interlace      bound()ed to 0..1 when threading is used, otherwise 0
//   colorpixels    colour buffer 0; colour buffers 1..3 start out as NULL
//   depthpixels    depth buffer
// All of dpsoftrast is zeroed first, so anything not assigned below is 0/NULL.
5368 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5378 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): presumably u is an endianness probe - confirm
5379 dpsoftrast.bigendian = u.b[3];
5380 dpsoftrast.fb_width = width;
5381 dpsoftrast.fb_height = height;
5382 dpsoftrast.fb_depthpixels = depthpixels;
5383 dpsoftrast.fb_colorpixels[0] = colorpixels;
// only colour target 0 is supplied; clear each of the remaining three slots
// (the original assigned index 1 three times)
5384 dpsoftrast.fb_colorpixels[1] = NULL;
5385 dpsoftrast.fb_colorpixels[2] = NULL;
5386 dpsoftrast.fb_colorpixels[3] = NULL;
5387 dpsoftrast.viewport[0] = 0;
5388 dpsoftrast.viewport[1] = 0;
5389 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5390 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5391 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with no storage allocated yet
// (texture_max = 0 and the texture pointer is still NULL from the memset)
5392 dpsoftrast.texture_firstfree = 1;
5393 dpsoftrast.texture_end = 1;
5394 dpsoftrast.texture_max = 0;
5395 dpsoftrast.color[0] = 1;
5396 dpsoftrast.color[1] = 1;
5397 dpsoftrast.color[2] = 1;
5398 dpsoftrast.color[3] = 1;
5399 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5400 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5401 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): the allocation result is used without a visible NULL check
5402 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// give every thread state the same default render state
5403 for (i = 0; i < dpsoftrast.numthreads; i++)
5405 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5407 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not set next to [1..3] and so keeps the
// zero from MM_CALLOC - confirm whether that is intended
5408 thread->colormask[1] = 1;
5409 thread->colormask[2] = 1;
5410 thread->colormask[3] = 1;
5411 thread->blendfunc[0] = GL_ONE;
5412 thread->blendfunc[1] = GL_ZERO;
5413 thread->depthmask = true;
5414 thread->depthtest = true;
5415 thread->depthfunc = GL_LEQUAL;
5416 thread->scissortest = false;
5417 thread->alphatest = false;
5418 thread->alphafunc = GL_GREATER;
5419 thread->alphavalue = 0.5f;
5420 thread->viewport[0] = 0;
5421 thread->viewport[1] = 0;
5422 thread->viewport[2] = dpsoftrast.fb_width;
5423 thread->viewport[3] = dpsoftrast.fb_height;
5424 thread->scissor[0] = 0;
5425 thread->scissor[1] = 0;
5426 thread->scissor[2] = dpsoftrast.fb_width;
5427 thread->scissor[3] = dpsoftrast.fb_height;
5428 thread->depthrange[0] = 0;
5429 thread->depthrange[1] = 1;
5430 thread->polygonoffset[0] = 0;
5431 thread->polygonoffset[1] = 0;
5433 DPSOFTRAST_RecalcThread(thread);
5435 thread->numspans = 0;
5436 thread->numtriangles = 0;
5437 thread->commandoffset = 0;
5438 thread->waiting = false;
5439 thread->starving = false;
// force a full validation of all derived state
5441 thread->validate = -1;
5442 DPSOFTRAST_Validate(thread, -1);
// synchronization objects and the worker itself exist only in threaded mode
5444 if (dpsoftrast.usethreads)
5446 thread->waitcond = Thread_CreateCond();
5447 thread->drawcond = Thread_CreateCond();
5448 thread->drawmutex = Thread_CreateMutex();
5449 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5455 void DPSOFTRAST_Shutdown(void)
5458 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5460 DPSOFTRAST_State_Thread *thread;
5461 for (i = 0; i < dpsoftrast.numthreads; i++)
5463 thread = &dpsoftrast.threads[i];
5464 Thread_LockMutex(thread->drawmutex);
5466 Thread_CondSignal(thread->drawcond);
5467 Thread_UnlockMutex(thread->drawmutex);
5468 Thread_WaitThread(thread->thread, 0);
5469 Thread_DestroyCond(thread->waitcond);
5470 Thread_DestroyCond(thread->drawcond);
5471 Thread_DestroyMutex(thread->drawmutex);
5474 for (i = 0;i < dpsoftrast.texture_end;i++)
5475 if (dpsoftrast.texture[i].bytes)
5476 MM_FREE(dpsoftrast.texture[i].bytes);
5477 if (dpsoftrast.texture)
5478 free(dpsoftrast.texture);
5479 if (dpsoftrast.threads)
5480 MM_FREE(dpsoftrast.threads);
5481 memset(&dpsoftrast, 0, sizeof(dpsoftrast));