de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
dpsoftrast: fix off-by-0.5 in GL_NEAREST
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER // MSVC only: silence warning number 4324 for this file
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus // plain C builds: make "bool" an alias of qboolean (qboolean is declared outside this file view)
14 typedef qboolean bool;
15 #endif
16
17 #define ALIGN_SIZE 16 // byte alignment; same value the ALIGN() attribute macros in this file hard-code
18 #define ATOMIC_SIZE 32 // byte alignment; same value the ATOMIC() attribute macros hard-code, also handed to _mm_malloc by MM_MALLOC/MM_CALLOC
19
20 #ifdef SSE_POSSIBLE // compiler-specific alignment, barrier and atomic-counter macros; only selected when SSE_POSSIBLE is defined
21         #if defined(__APPLE__)
22                 #include <libkern/OSAtomic.h>
23                 #define ALIGN(var) var __attribute__((__aligned__(16)))
24                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25                 #define MEMORY_BARRIER (_mm_sfence())
26                 #define ATOMIC_COUNTER volatile int32_t 
27                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
30         #elif defined(__GNUC__)
31                 #define ALIGN(var) var __attribute__((__aligned__(16)))
32                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33                 #define MEMORY_BARRIER (_mm_sfence())
34                 //(__sync_synchronize())
35                 #define ATOMIC_COUNTER volatile int
36                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
39         #elif defined(_MSC_VER)
40                 #define ALIGN(var) __declspec(align(16)) var
41                 #define ATOMIC(var) __declspec(align(32)) var
42                 #define MEMORY_BARRIER (_mm_sfence())
43                 //(MemoryBarrier())
44                 #define ATOMIC_COUNTER volatile LONG
45                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
48         #endif
49 #endif
50
// Fallbacks for any macro not defined above (no SSE_POSSIBLE, or an unrecognized compiler):
// ALIGN/ATOMIC add no alignment, MEMORY_BARRIER does nothing, and the counter
// operations are ordinary non-atomic ++, -- and += on a plain int.
51 #ifndef ALIGN
52 #define ALIGN(var) var
53 #endif
54 #ifndef ATOMIC
55 #define ATOMIC(var) var
56 #endif
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
59 #endif
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
62 #endif
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
65 #endif
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
68 #endif
69 #ifndef ATOMIC_ADD
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
71 #endif
72
// Pixel/command memory allocators.
// With SSE_POSSIBLE the blocks come from _mm_malloc aligned to ATOMIC_SIZE bytes
// and must be released with MM_FREE (_mm_free); otherwise the plain C heap is used.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc() counterpart of MM_MALLOC: returns nmemb*size zero-filled bytes,
// or NULL when the allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc() rejects a wrapping nmemb*size; do the same instead of
	// silently allocating a too-small block
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
92 typedef enum DPSOFTRAST_ARRAY_e // indices of the per-vertex attribute arrays
93 {
94         DPSOFTRAST_ARRAY_POSITION,
95         DPSOFTRAST_ARRAY_COLOR,
96         DPSOFTRAST_ARRAY_TEXCOORD0,
97         DPSOFTRAST_ARRAY_TEXCOORD1,
98         DPSOFTRAST_ARRAY_TEXCOORD2,
99         DPSOFTRAST_ARRAY_TEXCOORD3,
100         DPSOFTRAST_ARRAY_TEXCOORD4,
101         DPSOFTRAST_ARRAY_TEXCOORD5,
102         DPSOFTRAST_ARRAY_TEXCOORD6,
103         DPSOFTRAST_ARRAY_TEXCOORD7,
104         DPSOFTRAST_ARRAY_TOTAL // number of arrays; sizes the attribs[] and post_array4f[] tables in this file
105 }
106 DPSOFTRAST_ARRAY;
107
108 typedef struct DPSOFTRAST_Texture_s // one slot of the dpsoftrast.texture array; a slot is in use while bytes is non-NULL
109 {
110         int flags; // DPSOFTRAST_TEXTURE_FLAG_* / format bits as passed to DPSOFTRAST_Texture_New
111         int width; // dimensions of mip level 0
112         int height;
113         int depth;
114         int sides; // 6 for cubemaps, otherwise 1
115         DPSOFTRAST_TEXTURE_FILTER filter;
116         int mipmaps; // number of valid entries in mipmap[]
117         int size; // total size of bytes, all mip levels together
118         ATOMIC_COUNTER binds; // when non-zero the texture functions call DPSOFTRAST_Flush() before touching the pixels
119         unsigned char *bytes; // pixel storage, 4 bytes per pixel
120         int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
121 }
122 DPSOFTRAST_Texture;
123
124 #define COMMAND_SIZE ALIGN_SIZE // commands use the same alignment value as ALIGN()
125 #define COMMAND_ALIGN(var) ALIGN(var)
126
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s // common header; every struct generated by DEFCOMMAND starts with these same two fields
128 {
129         unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* values
130         unsigned short commandsize; // presumably the size in bytes of the whole command -- TODO confirm against the code that fills it in
131 }
132 DPSOFTRAST_Command);
133
134 enum { DPSOFTRAST_OPCODE_Reset = 0 }; // opcode 0 has no command struct of its own
135
// DEFCOMMAND(opcodeval, name, fields) declares, for one command type:
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the aligned struct DPSOFTRAST_Command_<name>: the opcode/commandsize header
//     followed by the caller-supplied fields
136 #define DEFCOMMAND(opcodeval, name, fields) \
137         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
139         { \
140                 unsigned char opcode; \
141                 unsigned short commandsize; \
142                 fields \
143         } DPSOFTRAST_Command_##name );
144
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152 // size in bytes of the commands[] buffer below
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
147
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s // fixed-size byte buffer that holds queued commands
149 {
150         int freecommand; // NOTE(review): looks like an offset into commands[] -- confirm against the allocator code
151         int usedcommands;
152         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
153 }
154 DPSOFTRAST_State_Command_Pool);
155
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s // per-triangle interpolation data read by the DPSOFTRAST_CALCATTRIB macros
157 {
158         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
159         float w[3];
160         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]); // per array: [0] is multiplied by span x, [1] by span y, [2] is the constant term; 4 floats each
161 }
162 DPSOFTRAST_State_Triangle);
163
// Evaluate one attribute array of a triangle at the start of a span (SSE version).
//   slope receives attribs[arrayindex][0], the per-x step
//   data  receives attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// data and slope must be __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4] arrays.
// Every use of the span argument is parenthesized so that an expression such as
// "spans + i" can be passed, exactly as the SSE version already allows.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
182
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s // one horizontal run of pixels belonging to one triangle
184 {
185         int triangle; // triangle this span was generated by
186         int x; // framebuffer x coord
187         int y; // framebuffer y coord
188         int startx; // usable range (according to pixelmask)
189         int endx; // usable range (according to pixelmask)
190         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
191 }
192 DPSOFTRAST_State_Span);
193
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024 // capacity of the per-thread spans[] array
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128 // capacity of the per-thread triangles[] array
196
// Dirty bits kept in DPSOFTRAST_State_Thread.validate; DPSOFTRAST_Validate
// clears a bit and calls the matching DPSOFTRAST_Recalc* function.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
201
202 typedef enum DPSOFTRAST_BLENDMODE_e // blend modes chosen by DPSOFTRAST_RecalcBlendFunc from the (source, destination) factor pair
203 {
204         DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO; also the fallback for every unlisted pair
205         DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
206         DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
207         DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
208         DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
209         DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
210         DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
211         DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE while blendsubtract is set
212         DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
213         DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
214         DPSOFTRAST_BLENDMODE_TOTAL
215 }
216 DPSOFTRAST_BLENDMODE;
217
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s // state owned by one rasterizer thread: its copy of the render state, derived values and work lists
219 {
220         void *thread;
221         int index; // position of this thread among dpsoftrast.numthreads; selects its framebuffer band in DPSOFTRAST_RecalcThread
222         
223         int cullface;
224         int colormask[4];
225         int blendfunc[2]; // [0] source factor, [1] destination factor (GL_* enums), see DPSOFTRAST_RecalcBlendFunc
226         int blendsubtract;
227         int depthmask;
228         int depthtest;
229         int depthfunc;
230         int scissortest;
231         int alphatest;
232         int alphafunc;
233         float alphavalue;
234         int viewport[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcViewport
235         int scissor[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcFB
236         float depthrange[2];
237         float polygonoffset[2];
238
239         int shader_mode;
240         int shader_permutation;
241         int shader_exactspecularmath;
242
243         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow when that array moves
244         
245         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
247
248         // DPSOFTRAST_VALIDATE_ flags
249         int validate;
250
251         // derived values (DPSOFTRAST_VALIDATE_FB)
252         int fb_colormask;
253         int fb_scissor[4]; // x, y, width, height in framebuffer rows (y flipped), clamped to the framebuffer
254         ALIGN(float fb_viewportcenter[4]);
255         ALIGN(float fb_viewportscale[4]);
256
257         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
258         int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off
259
260         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
261         int fb_blendmode; // a DPSOFTRAST_BLENDMODE value
262
263         // band boundaries
264         int miny1;
265         int maxy1;
266         int miny2;
267         int maxy2;
268
269         ATOMIC(volatile int commandoffset);
270
271         volatile bool waiting;
272         volatile bool starving;
273         void *waitcond;
274         void *drawcond;
275         void *drawmutex;
276
277         int numspans;
278         int numtriangles;
279         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
280         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
281 }
282 DPSOFTRAST_State_Thread);
283
284 typedef ATOMIC(struct DPSOFTRAST_State_s // global rasterizer state: framebuffer, vertex arrays, texture table, threads and the command pool
285 {
286         int fb_width; // framebuffer dimensions in pixels
287         int fb_height;
288         unsigned int *fb_depthpixels;
289         unsigned int *fb_colorpixels[4];
290
291         int viewport[4];
292         ALIGN(float fb_viewportcenter[4]);
293         ALIGN(float fb_viewportscale[4]);
294
295         float color[4];
296         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
297         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
298
299         const float *pointer_vertex3f;
300         const float *pointer_color4f;
301         const unsigned char *pointer_color4ub;
302         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
303         int stride_vertex;
304         int stride_color;
305         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
306         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
307         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into texture[]; rebased by DPSOFTRAST_Texture_Grow when the array moves
308
309         int firstvertex;
310         int numvertices;
311         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
312         float *screencoord4f;
313         int drawstarty;
314         int drawendy;
315         int drawclipped;
316         
317         int shader_mode;
318         int shader_permutation;
319         int shader_exactspecularmath;
320
321         int texture_max; // allocated capacity of texture[], grown by DPSOFTRAST_Texture_Grow
322         int texture_end; // one past the highest slot in use; valid texture indices are 1 .. texture_end-1
323         int texture_firstfree; // index where DPSOFTRAST_Texture_New starts looking for a free slot
324         DPSOFTRAST_Texture *texture; // texture slot array
325
326         int bigendian;
327
328         // error reporting
329         const char *errorstring; // set to a static message by functions in this file when they reject a request
330
331         bool usethreads;
332         int interlace; // non-zero: each thread gets two framebuffer bands instead of one (see DPSOFTRAST_RecalcThread)
333         int numthreads; // number of entries in threads[]
334         DPSOFTRAST_State_Thread *threads;
335
336         ATOMIC(volatile int drawcommand);
337
338         DPSOFTRAST_State_Command_Pool commandpool;
339 }
340 DPSOFTRAST_State);
341
342 DPSOFTRAST_State dpsoftrast; // the one global state instance that the functions in this file read and modify
343
// factor used by DPSOFTRAST_DEPTH32_FROM_DEPTH32F to turn a float depth into an integer (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one int: each channel is scaled by 255 and
// rounded, then placed as b in bits 0-7, g in 8-15, r in 16-23, a in 24-31.
// Arguments are parenthesized so expressions such as "x + y" convert correctly.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts float depth d to an integer depth as DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
349
350 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
351 {
352         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
353         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
354         fb_viewportcenter[3] = 0.5f;
355         fb_viewportcenter[0] = 0.0f;
356         fb_viewportscale[1] = 0.5f * viewport[2];
357         fb_viewportscale[2] = -0.5f * viewport[3];
358         fb_viewportscale[3] = 0.5f;
359         fb_viewportscale[0] = 1.0f;
360 }
361
362 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
363 {
364         if (dpsoftrast.interlace)
365         {
366                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
367                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370         }
371         else
372         {
373                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
374                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
375         }
376 }
377
378 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
379 {
380         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
381         // and viewport projection values
382         int x1, x2;
383         int y1, y2;
384         x1 = thread->scissor[0];
385         x2 = thread->scissor[0] + thread->scissor[2];
386         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
387         y2 = dpsoftrast.fb_height - thread->scissor[1];
388         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
389         if (x1 < 0) x1 = 0;
390         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
391         if (y1 < 0) y1 = 0;
392         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
393         thread->fb_scissor[0] = x1;
394         thread->fb_scissor[1] = y1;
395         thread->fb_scissor[2] = x2 - x1;
396         thread->fb_scissor[3] = y2 - y1;
397
398         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
399         DPSOFTRAST_RecalcThread(thread);
400 }
401
402 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
403 {
404         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
405 }
406
407 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
408 {
409         if (thread->blendsubtract)
410         {
411                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
412                 {
413                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
414                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
415                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
416                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
417                 }
418         }
419         else
420         {       
421                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
422                 {
423                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
424                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
425                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
426                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
427                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
428                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
429                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
430                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
431                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
432                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
433                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
434                 }
435         }
436 }
437
438 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
439
440 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
441 {
442         mask &= thread->validate;
443         if (!mask)
444                 return;
445         if (mask & DPSOFTRAST_VALIDATE_FB)
446         {
447                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
448                 DPSOFTRAST_RecalcFB(thread);
449         }
450         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
451         {
452                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
453                 DPSOFTRAST_RecalcDepthFunc(thread);
454         }
455         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
456         {
457                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
458                 DPSOFTRAST_RecalcBlendFunc(thread);
459         }
460 }
461
462 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
463 {
464         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
465                 return &dpsoftrast.texture[index];
466         return NULL;
467 }
468
469 static void DPSOFTRAST_Texture_Grow(void)
470 {
471         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
472         DPSOFTRAST_State_Thread *thread;
473         int i;
474         int j;
475         DPSOFTRAST_Flush();
476         // expand texture array as needed
477         if (dpsoftrast.texture_max < 1024)
478                 dpsoftrast.texture_max = 1024;
479         else
480                 dpsoftrast.texture_max *= 2;
481         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
482         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
483                 if (dpsoftrast.texbound[i])
484                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
485         for (j = 0; j < dpsoftrast.numthreads; j++)
486         {
487                 thread = &dpsoftrast.threads[j];
488                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
489                         if (thread->texbound[i])
490                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
491         }
492 }
493
494 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
495 {
496         int w;
497         int h;
498         int d;
499         int size;
500         int s;
501         int texnum;
502         int mipmaps;
503         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
504         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
505         DPSOFTRAST_Texture *texture;
506         if (width*height*depth < 1)
507         {
508                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
509                 return 0;
510         }
511         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
512         {
513                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
514                 return 0;
515         }
516         switch(texformat)
517         {
518         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
519         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
520         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
521                 break;
522         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
523                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
524                 {
525                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
526                         return 0;
527                 }
528                 if (depth != 1)
529                 {
530                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
531                         return 0;
532                 }
533                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
534                 {
535                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
536                         return 0;
537                 }
538                 break;
539         }
540         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
541         {
542                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
543                 return 0;
544         }
545         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
546         {
547                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
548                 return 0;
549         }
550         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
551         {
552                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
553                 return 0;
554         }
555         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
556         {
557                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
558                 return 0;
559         }
560         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
561         {
562                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
563                 return 0;
564         }
565         // find first empty slot in texture array
566         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
567                 if (!dpsoftrast.texture[texnum].bytes)
568                         break;
569         dpsoftrast.texture_firstfree = texnum + 1;
570         if (dpsoftrast.texture_max <= texnum)
571                 DPSOFTRAST_Texture_Grow();
572         if (dpsoftrast.texture_end <= texnum)
573                 dpsoftrast.texture_end = texnum + 1;
574         texture = &dpsoftrast.texture[texnum];
575         memset(texture, 0, sizeof(*texture));
576         texture->flags = flags;
577         texture->width = width;
578         texture->height = height;
579         texture->depth = depth;
580         texture->sides = sides;
581         texture->binds = 0;
582         w = width;
583         h = height;
584         d = depth;
585         size = 0;
586         mipmaps = 0;
587         w = width;
588         h = height;
589         d = depth;
590         for (;;)
591         {
592                 s = w * h * d * sides * 4;
593                 texture->mipmap[mipmaps][0] = size;
594                 texture->mipmap[mipmaps][1] = s;
595                 texture->mipmap[mipmaps][2] = w;
596                 texture->mipmap[mipmaps][3] = h;
597                 texture->mipmap[mipmaps][4] = d;
598                 size += s;
599                 mipmaps++;
600                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
601                         break;
602                 if (w > 1) w >>= 1;
603                 if (h > 1) h >>= 1;
604                 if (d > 1) d >>= 1;
605         }
606         texture->mipmaps = mipmaps;
607         texture->size = size;
608
609         // allocate the pixels now
610         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
611
612         return texnum;
613 }
614 void DPSOFTRAST_Texture_Free(int index)
615 {
616         DPSOFTRAST_Texture *texture;
617         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
618         if (texture->binds)
619                 DPSOFTRAST_Flush();
620         if (texture->bytes)
621                 MM_FREE(texture->bytes);
622         texture->bytes = NULL;
623         memset(texture, 0, sizeof(*texture));
624         // adjust the free range and used range
625         if (dpsoftrast.texture_firstfree > index)
626                 dpsoftrast.texture_firstfree = index;
627         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
628                 dpsoftrast.texture_end--;
629 }
630 void DPSOFTRAST_Texture_CalculateMipmaps(int index) // rebuilds mip levels 1..mipmaps-1 of a texture, each by averaging the level above it (4 bytes per pixel, averaged per byte with rounding)
631 {
632         int i, x, y, z, w, layer0, layer1, row0, row1;
633         unsigned char *o, *i0, *i1, *i2, *i3;
634         DPSOFTRAST_Texture *texture;
635         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636         if (texture->mipmaps <= 1) // nothing to generate
637                 return;
638         for (i = 1;i < texture->mipmaps;i++) // i is the level being written, i-1 the level being read
639         {
640                 for (z = 0;z < texture->mipmap[i][4];z++)
641                 {
642                         layer0 = z*2; // the two source layers for destination layer z
643                         layer1 = z*2+1;
644                         if (layer1 >= texture->mipmap[i-1][4]) // source has a single layer: use it twice
645                                 layer1 = texture->mipmap[i-1][4]-1;
646                         for (y = 0;y < texture->mipmap[i][3];y++)
647                         {
648                                 row0 = y*2; // the two source rows for destination row y
649                                 row1 = y*2+1;
650                                 if (row1 >= texture->mipmap[i-1][3]) // source has a single row: use it twice
651                                         row1 = texture->mipmap[i-1][3]-1;
652                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]); // start of the destination row
653                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]); // layer0, row0
654                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]); // layer0, row1
655                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]); // layer1, row0
656                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]); // layer1, row1
657                                 w = texture->mipmap[i][2];
658                                 if (layer1 > layer0) // two distinct source layers
659                                 {
660                                         if (texture->mipmap[i-1][2] > 1)
661                                         {
662                                                 // average 3D texture
663                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8) // one output pixel from a 2x2x2 block of source pixels
664                                                 {
665                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
666                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
667                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
668                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
669                                                 }
670                                         }
671                                         else
672                                         {
673                                                 // average 3D mipmap with parent width == 1
674                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8) // only i0/i1 advance here; i2/i3 keep pointing at the first pixel of their rows
675                                                 {
676                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
677                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
678                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
679                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
680                                                 }
681                                         }
682                                 }
683                                 else
684                                 {
685                                         if (texture->mipmap[i-1][2] > 1)
686                                         {
687                                                 // average 2D texture (common case)
688                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8) // one output pixel from a 2x2 block of source pixels
689                                                 {
690                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
691                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
692                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
693                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
694                                                 }
695                                         }
696                                         else
697                                         {
698                                                 // 2D texture with parent width == 1
699                                                 o[0] = (i0[0] + i1[0] + 1) >> 1; // single output pixel from the two source rows
700                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
701                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
702                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
703                                         }
704                                 }
705                         }
706                 }
707         }
708 }
709 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
710 {
711         DPSOFTRAST_Texture *texture;
712         unsigned char *dst;
713         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
714         if (texture->binds)
715                 DPSOFTRAST_Flush();
716         if (pixels)
717         {
718                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
719                 while (blockheight > 0)
720                 {
721                         memcpy(dst, pixels, blockwidth * 4);
722                         pixels += blockwidth * 4;
723                         dst += texture->mipmap[0][2] * 4;
724                         blockheight--;
725                 }
726         }
727         DPSOFTRAST_Texture_CalculateMipmaps(index);
728 }
729 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (texture->binds)
734                 DPSOFTRAST_Flush();
735         if (pixels)
736                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
737         DPSOFTRAST_Texture_CalculateMipmaps(index);
738 }
739 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
740 {
741         DPSOFTRAST_Texture *texture;
742         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
743         return texture->mipmap[mip][2];
744 }
745 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
746 {
747         DPSOFTRAST_Texture *texture;
748         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
749         return texture->mipmap[mip][3];
750 }
751 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
752 {
753         DPSOFTRAST_Texture *texture;
754         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755         return texture->mipmap[mip][4];
756 }
757 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         if (texture->binds)
762                 DPSOFTRAST_Flush();
763         return texture->bytes + texture->mipmap[mip][0];
764 }
765 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
766 {
767         DPSOFTRAST_Texture *texture;
768         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
770         {
771                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
772                 return;
773         }
774         if (texture->binds)
775                 DPSOFTRAST_Flush();
776         texture->filter = filter;
777 }
778
779 static void DPSOFTRAST_Draw_FlushThreads(void);
780
781 static void DPSOFTRAST_Draw_SyncCommands(void)
782 {
783         if(dpsoftrast.usethreads) MEMORY_BARRIER;
784         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
785 }
786
// Waits until at least `space` bytes of the command pool are no longer in use
// by any thread, then stores the recomputed usage in commandpool.usedcommands.
// Returns immediately if the recorded usage already leaves enough room.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
        DPSOFTRAST_State_Thread *thread;
        int i;
        int freecommand = dpsoftrast.commandpool.freecommand;
        int usedcommands = dpsoftrast.commandpool.usedcommands;
        if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
                return;
        // make everything queued so far visible via dpsoftrast.drawcommand
        DPSOFTRAST_Draw_SyncCommands();
        for(;;)
        {
                // index of the thread that is furthest behind, -1 if none is behind
                int waitindex = -1;
                int commandoffset;
                usedcommands = 0;
                for (i = 0; i < dpsoftrast.numthreads; i++)
                {
                        thread = &dpsoftrast.threads[i];
                        // distance from this thread's position to the write position,
                        // corrected for wrap-around of the pool
                        commandoffset = freecommand - thread->commandoffset;
                        if (commandoffset < 0)
                                commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
                        if (commandoffset > usedcommands)
                        {
                                waitindex = i;
                                usedcommands = commandoffset;
                        }
                }
                // the largest distance is the real usage; stop once it leaves enough room
                if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
                        break;
                thread = &dpsoftrast.threads[waitindex];
                Thread_LockMutex(thread->drawmutex);
                // re-check under the mutex: only wait if the thread has not caught up yet
                if (thread->commandoffset != dpsoftrast.drawcommand)
                {
                        thread->waiting = true;
                        if (thread->starving) Thread_CondSignal(thread->drawcond);
                        Thread_CondWait(thread->waitcond, thread->drawmutex);
                        thread->waiting = false;
                }
                Thread_UnlockMutex(thread->drawmutex);
        }
        dpsoftrast.commandpool.usedcommands = usedcommands;
}
828
// Rounds size up to the next multiple of COMMAND_SIZE.
// NOTE(review): the mask arithmetic is only correct if COMMAND_SIZE is a power of two - confirm at its definition.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
        ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command with opcode DPSOFTRAST_OPCODE_<name> and the padded size
// of DPSOFTRAST_Command_<name>, returning it as a pointer to that struct type.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
        ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
833
// Reserves `size` bytes in the command pool ring buffer, writes the command
// header (opcode and commandsize) and returns a pointer to the new command.
// If the pool is too full, older commands are retired first: by waiting on the
// worker threads when threads are in use, otherwise by DPSOFTRAST_Draw_FlushThreads().
static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
{
        DPSOFTRAST_Command *command;
        int freecommand = dpsoftrast.commandpool.freecommand;
        int usedcommands = dpsoftrast.commandpool.usedcommands;
        // always keep room for one bare command header in addition to the request
        int extra = sizeof(DPSOFTRAST_Command);
        // the command does not fit before the end of the pool, so the tail is lost too
        if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
                extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
        if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
        {
                if (dpsoftrast.usethreads)
                        DPSOFTRAST_Draw_FreeCommandPool(size + extra);
                else
                        DPSOFTRAST_Draw_FlushThreads();
                // both calls may have changed the pool state, so reload it
                freecommand = dpsoftrast.commandpool.freecommand;
                usedcommands = dpsoftrast.commandpool.usedcommands;
        }
        if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
        {
                // mark the unused tail with a Reset opcode and wrap to the start
                command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
                command->opcode = DPSOFTRAST_OPCODE_Reset;
                usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
                freecommand = 0;
        }
        command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
        command->opcode = opcode;
        command->commandsize = size;
        freecommand += size;
        if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
                freecommand = 0;
        dpsoftrast.commandpool.freecommand = freecommand;
        dpsoftrast.commandpool.usedcommands = usedcommands + size;
        return command;
}
868
869 static void DPSOFTRAST_UndoCommand(int size)
870 {
871         int freecommand = dpsoftrast.commandpool.freecommand;
872         int usedcommands = dpsoftrast.commandpool.usedcommands;
873         freecommand -= size;
874         if (freecommand < 0)
875                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
876         usedcommands -= size;
877         dpsoftrast.commandpool.freecommand = freecommand;
878         dpsoftrast.commandpool.usedcommands = usedcommands;
879 }
880                 
881 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
882 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
883 {
884         thread->viewport[0] = command->x;
885         thread->viewport[1] = command->y;
886         thread->viewport[2] = command->width;
887         thread->viewport[3] = command->height;
888         thread->validate |= DPSOFTRAST_VALIDATE_FB;
889 }
890 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
891 {
892         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
893         command->x = x;
894         command->y = y;
895         command->width = width;
896         command->height = height;
897
898         dpsoftrast.viewport[0] = x;
899         dpsoftrast.viewport[1] = y;
900         dpsoftrast.viewport[2] = width;
901         dpsoftrast.viewport[3] = height;
902         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
903 }
904
DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Fills the scissor rectangle of every attached color buffer with the command's
// color, restricted to the rows in the thread's two row ranges
// [miny1, maxy1) and [miny2, maxy2).
static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
{
        int i, x1, y1, x2, y2, w, h, x, y;
        int miny1, maxy1, miny2, maxy2;
        int bandy;
        unsigned int *p;
        unsigned int c;
        DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
        miny1 = thread->miny1;
        maxy1 = thread->maxy1;
        miny2 = thread->miny2;
        maxy2 = thread->maxy2;
        // fb_scissor layout: [0]=x, [1]=y, [2]=width, [3]=height
        x1 = thread->fb_scissor[0];
        y1 = thread->fb_scissor[1];
        x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
        y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
        // clip the rectangle to the overall row span covered by both ranges
        if (y1 < miny1) y1 = miny1;
        if (y2 > maxy2) y2 = maxy2;
        w = x2 - x1;
        h = y2 - y1;
        if (w < 1 || h < 1)
                return;
        // FIXME: honor fb_colormask?
        c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
        for (i = 0;i < 4;i++)
        {
                // skip color buffers that are not attached
                if (!dpsoftrast.fb_colorpixels[i])
                        continue;
                // first pass clears rows up to maxy1; the step expression then jumps y
                // to miny2 and raises the limit to maxy2 for the second row range
                for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
                for (;y < bandy;y++)
                {
                        p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
                        for (x = x1;x < x2;x++)
                                p[x] = c;
                }
        }
}
943 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
944 {
945         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
946         command->r = r;
947         command->g = g;
948         command->b = b;
949         command->a = a;
950 }
951
DEFCOMMAND(3, ClearDepth, float depth;)
// Fills the scissor rectangle of the depth buffer with the command's depth
// value, restricted to the rows in the thread's two row ranges
// [miny1, maxy1) and [miny2, maxy2).
static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
{
        int x1, y1, x2, y2, w, h, x, y;
        int miny1, maxy1, miny2, maxy2;
        int bandy;
        unsigned int *p;
        unsigned int c;
        DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
        miny1 = thread->miny1;
        maxy1 = thread->maxy1;
        miny2 = thread->miny2;
        maxy2 = thread->maxy2;
        // fb_scissor layout: [0]=x, [1]=y, [2]=width, [3]=height
        x1 = thread->fb_scissor[0];
        y1 = thread->fb_scissor[1];
        x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
        y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
        // clip the rectangle to the overall row span covered by both ranges
        if (y1 < miny1) y1 = miny1;
        if (y2 > maxy2) y2 = maxy2;
        w = x2 - x1;
        h = y2 - y1;
        if (w < 1 || h < 1)
                return;
        c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
        // first pass clears rows up to maxy1; the step expression then jumps y
        // to miny2 and raises the limit to maxy2 for the second row range
        for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
        for (;y < bandy;y++)
        {
                p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
                for (x = x1;x < x2;x++)
                        p[x] = c;
        }
}
984 void DPSOFTRAST_ClearDepth(float d)
985 {
986         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
987         command->depth = d;
988 }
989
990 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
991 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
992 {
993         thread->colormask[0] = command->r != 0;
994         thread->colormask[1] = command->g != 0;
995         thread->colormask[2] = command->b != 0;
996         thread->colormask[3] = command->a != 0;
997         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
998 }
999 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1000 {
1001         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1002         command->r = r;
1003         command->g = g;
1004         command->b = b;
1005         command->a = a;
1006 }
1007
1008 DEFCOMMAND(5, DepthTest, int enable;)
1009 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1010 {
1011         thread->depthtest = command->enable;
1012         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1013 }
1014 void DPSOFTRAST_DepthTest(int enable)
1015 {
1016         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1017         command->enable = enable;
1018 }
1019
1020 DEFCOMMAND(6, ScissorTest, int enable;)
1021 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1022 {
1023         thread->scissortest = command->enable;
1024         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1025 }
1026 void DPSOFTRAST_ScissorTest(int enable)
1027 {
1028         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1029         command->enable = enable;
1030 }
1031
1032 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1033 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1034 {
1035         thread->scissor[0] = command->x;
1036         thread->scissor[1] = command->y;
1037         thread->scissor[2] = command->width;
1038         thread->scissor[3] = command->height;
1039         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1040 }
1041 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1042 {
1043         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1044         command->x = x;
1045         command->y = y;
1046         command->width = width;
1047         command->height = height;
1048 }
1049
1050 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1051 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1052 {
1053         thread->blendfunc[0] = command->sfactor;
1054         thread->blendfunc[1] = command->dfactor;
1055         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1056 }
1057 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1058 {
1059         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1060         command->sfactor = sfactor;
1061         command->dfactor = dfactor;
1062 }
1063
1064 DEFCOMMAND(9, BlendSubtract, int enable;)
1065 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1066 {
1067         thread->blendsubtract = command->enable;
1068         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1069 }
1070 void DPSOFTRAST_BlendSubtract(int enable)
1071 {
1072         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1073         command->enable = enable;
1074 }
1075
1076 DEFCOMMAND(10, DepthMask, int enable;)
1077 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1078 {
1079         thread->depthmask = command->enable;
1080 }
1081 void DPSOFTRAST_DepthMask(int enable)
1082 {
1083         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1084         command->enable = enable;
1085 }
1086
1087 DEFCOMMAND(11, DepthFunc, int func;)
1088 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1089 {
1090         thread->depthfunc = command->func;
1091 }
1092 void DPSOFTRAST_DepthFunc(int func)
1093 {
1094         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1095         command->func = func;
1096 }
1097
1098 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1099 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1100 {
1101         thread->depthrange[0] = command->nearval;
1102         thread->depthrange[1] = command->farval;
1103 }
1104 void DPSOFTRAST_DepthRange(float nearval, float farval)
1105 {
1106         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1107         command->nearval = nearval;
1108         command->farval = farval;
1109 }
1110
1111 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1112 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1113 {
1114         thread->polygonoffset[0] = command->alongnormal;
1115         thread->polygonoffset[1] = command->intoview;
1116 }
1117 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1118 {
1119         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1120         command->alongnormal = alongnormal;
1121         command->intoview = intoview;
1122 }
1123
1124 DEFCOMMAND(14, CullFace, int mode;)
1125 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1126 {
1127         thread->cullface = command->mode;
1128 }
1129 void DPSOFTRAST_CullFace(int mode)
1130 {
1131         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1132         command->mode = mode;
1133 }
1134
1135 DEFCOMMAND(15, AlphaTest, int enable;)
1136 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1137 {
1138         thread->alphatest = command->enable;
1139 }
1140 void DPSOFTRAST_AlphaTest(int enable)
1141 {
1142         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1143         command->enable = enable;
1144 }
1145
1146 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1147 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1148 {
1149         thread->alphafunc = command->func;
1150         thread->alphavalue = command->ref;
1151 }
1152 void DPSOFTRAST_AlphaFunc(int func, float ref)
1153 {
1154         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1155         command->func = func;
1156         command->ref = ref;
1157 }
1158
1159 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1160 {
1161         dpsoftrast.color[0] = r;
1162         dpsoftrast.color[1] = g;
1163         dpsoftrast.color[2] = b;
1164         dpsoftrast.color[3] = a;
1165 }
1166
1167 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1168 {
1169         int outstride = blockwidth * 4;
1170         int instride = dpsoftrast.fb_width * 4;
1171         int bx1 = blockx;
1172         int by1 = blocky;
1173         int bx2 = blockx + blockwidth;
1174         int by2 = blocky + blockheight;
1175         int bw;
1176         int x;
1177         int y;
1178         unsigned char *inpixels;
1179         unsigned char *b;
1180         unsigned char *o;
1181         DPSOFTRAST_Flush();
1182         if (bx1 < 0) bx1 = 0;
1183         if (by1 < 0) by1 = 0;
1184         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1185         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1186         bw = bx2 - bx1;
1187         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1188         if (dpsoftrast.bigendian)
1189         {
1190                 for (y = by1;y < by2;y++)
1191                 {
1192                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1193                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1194                         for (x = bx1;x < bx2;x++)
1195                         {
1196                                 o[0] = b[3];
1197                                 o[1] = b[2];
1198                                 o[2] = b[1];
1199                                 o[3] = b[0];
1200                                 o += 4;
1201                                 b += 4;
1202                         }
1203                 }
1204         }
1205         else
1206         {
1207                 for (y = by1;y < by2;y++)
1208                 {
1209                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1210                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1211                         memcpy(o, b, bw*4);
1212                 }
1213         }
1214
1215 }
1216 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1217 {
1218         int tx1 = tx;
1219         int ty1 = ty;
1220         int tx2 = tx + width;
1221         int ty2 = ty + height;
1222         int sx1 = sx;
1223         int sy1 = sy;
1224         int sx2 = sx + width;
1225         int sy2 = sy + height;
1226         int swidth;
1227         int sheight;
1228         int twidth;
1229         int theight;
1230         int sw;
1231         int sh;
1232         int tw;
1233         int th;
1234         int y;
1235         unsigned int *spixels;
1236         unsigned int *tpixels;
1237         DPSOFTRAST_Texture *texture;
1238         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1239         if (mip < 0 || mip >= texture->mipmaps) return;
1240         DPSOFTRAST_Flush();
1241         spixels = dpsoftrast.fb_colorpixels[0];
1242         swidth = dpsoftrast.fb_width;
1243         sheight = dpsoftrast.fb_height;
1244         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1245         twidth = texture->mipmap[mip][2];
1246         theight = texture->mipmap[mip][3];
1247         if (tx1 < 0) tx1 = 0;
1248         if (ty1 < 0) ty1 = 0;
1249         if (tx2 > twidth) tx2 = twidth;
1250         if (ty2 > theight) ty2 = theight;
1251         if (sx1 < 0) sx1 = 0;
1252         if (sy1 < 0) sy1 = 0;
1253         if (sx2 > swidth) sx2 = swidth;
1254         if (sy2 > sheight) sy2 = sheight;
1255         tw = tx2 - tx1;
1256         th = ty2 - ty1;
1257         sw = sx2 - sx1;
1258         sh = sy2 - sy1;
1259         if (tw > sw) tw = sw;
1260         if (th > sh) th = sh;
1261         if (tw < 1 || th < 1)
1262                 return;
1263         sy1 = sheight - 1 - sy1;
1264         for (y = 0;y < th;y++)
1265                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1266         if (texture->mipmaps > 1)
1267                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1268 }
1269
1270 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1271 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1272 {
1273         if (thread->texbound[command->unitnum])
1274                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1275         thread->texbound[command->unitnum] = command->texture;
1276 }
1277 void DPSOFTRAST_SetTexture(int unitnum, int index)
1278 {
1279         DPSOFTRAST_Command_SetTexture *command;
1280         DPSOFTRAST_Texture *texture;
1281         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1282         {
1283                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1284                 return;
1285         }
1286         texture = DPSOFTRAST_Texture_GetByIndex(index);
1287         if (index && !texture)
1288         {
1289                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1290                 return;
1291         }
1292
1293         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1294         command->unitnum = unitnum;
1295         command->texture = texture;
1296
1297         dpsoftrast.texbound[unitnum] = texture;
1298         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1299 }
1300
1301 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1302 {
1303         dpsoftrast.pointer_vertex3f = vertex3f;
1304         dpsoftrast.stride_vertex = stride;
1305 }
1306 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1307 {
1308         dpsoftrast.pointer_color4f = color4f;
1309         dpsoftrast.pointer_color4ub = NULL;
1310         dpsoftrast.stride_color = stride;
1311 }
1312 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1313 {
1314         dpsoftrast.pointer_color4f = NULL;
1315         dpsoftrast.pointer_color4ub = color4ub;
1316         dpsoftrast.stride_color = stride;
1317 }
1318 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1319 {
1320         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1321         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1322         dpsoftrast.stride_texcoord[unitnum] = stride;
1323 }
1324
1325 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1326 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1327 {
1328         thread->shader_mode = command->mode;
1329         thread->shader_permutation = command->permutation;
1330         thread->shader_exactspecularmath = command->exactspecularmath;
1331 }
1332 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1333 {
1334         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1335         command->mode = mode;
1336         command->permutation = permutation;
1337         command->exactspecularmath = exactspecularmath;
1338
1339         dpsoftrast.shader_mode = mode;
1340         dpsoftrast.shader_permutation = permutation;
1341         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1342 }
1343
1344 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1345 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1346 {
1347         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1348 }
1349 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1350 {
1351         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1352         command->index = index;
1353         command->val[0] = v0;
1354         command->val[1] = v1;
1355         command->val[2] = v2;
1356         command->val[3] = v3;
1357
1358         dpsoftrast.uniform4f[index*4+0] = v0;
1359         dpsoftrast.uniform4f[index*4+1] = v1;
1360         dpsoftrast.uniform4f[index*4+2] = v2;
1361         dpsoftrast.uniform4f[index*4+3] = v3;
1362 }
1363 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1364 {
1365         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1366         command->index = index;
1367         memcpy(command->val, v, sizeof(command->val));
1368
1369         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1370 }
1371
1372 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1373 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1374 {
1375         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1376 }
// Uploads `arraysize` consecutive 4x4 float matrices read from v, starting at
// uniform slot `uniform`. One UniformMatrix4f command is queued per matrix and
// the same values are mirrored into dpsoftrast.uniform4f. Each matrix takes
// four 4-float slots, so the slot index advances by 4 per matrix.
// If `transpose` is non-zero every matrix is transposed before it is stored.
// NOTE: without SSE_POSSIBLE the body is compiled out and the call does nothing.
void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
{
#ifdef SSE_POSSIBLE
        int i, index;
        for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
        {
                __m128 m0, m1, m2, m3;
                DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
                command->index = (DPSOFTRAST_UNIFORM)index;
                // use unaligned loads unless the source is 16-byte aligned
                if (((size_t)v)&(ALIGN_SIZE-1))
                {
                        m0 = _mm_loadu_ps(v);
                        m1 = _mm_loadu_ps(v+4);
                        m2 = _mm_loadu_ps(v+8);
                        m3 = _mm_loadu_ps(v+12);
                }
                else
                {
                        m0 = _mm_load_ps(v);
                        m1 = _mm_load_ps(v+4);
                        m2 = _mm_load_ps(v+8);
                        m3 = _mm_load_ps(v+12);
                }
                if (transpose)
                {
                        // 4x4 transpose: interleave pairs of rows, then recombine the
                        // low and high halves so each register holds one column
                        __m128 t0, t1, t2, t3;
                        t0 = _mm_unpacklo_ps(m0, m1);
                        t1 = _mm_unpacklo_ps(m2, m3);
                        t2 = _mm_unpackhi_ps(m0, m1);
                        t3 = _mm_unpackhi_ps(m2, m3);
                        m0 = _mm_movelh_ps(t0, t1);
                        m1 = _mm_movehl_ps(t1, t0);
                        m2 = _mm_movelh_ps(t2, t3);
                        m3 = _mm_movehl_ps(t3, t2);
                }
                // aligned stores into the queued command
                _mm_store_ps(command->val, m0);
                _mm_store_ps(command->val+4, m1);
                _mm_store_ps(command->val+8, m2);
                _mm_store_ps(command->val+12, m3);
                // aligned stores into the non-thread copy of the uniforms
                _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
                _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
                _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
                _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
        }
#endif
}
1423
1424 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1425 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1426 {
1427         thread->uniform1i[command->index] = command->val;
1428 }
1429 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1430 {
1431         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1432         command->index = index;
1433         command->val = i0;
1434
1435         dpsoftrast.uniform1i[command->index] = i0;
1436 }
1437
1438 #ifdef SSE_POSSIBLE
1439 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1440 {
1441         float *end = dst + size*4;
1442         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1443         {
1444                 while (dst < end)
1445                 {
1446                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1447                         dst += 4;
1448                         src += stride;
1449                 }
1450         }
1451         else
1452         {
1453                 while (dst < end)
1454                 {
1455                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1456                         dst += 4;
1457                         src += stride;
1458                 }
1459         }
1460 }
1461
1462 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1463 {
1464         float *end = dst + size*4;
1465         if (stride == sizeof(float[3]))
1466         {
1467                 float *end4 = dst + (size&~3)*4;        
1468                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1469                 {
1470                         while (dst < end4)
1471                         {
1472                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1473                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1474                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1480                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1481                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1484                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485                                 dst += 16;
1486                                 src += 4*sizeof(float[3]);
1487                         }
1488                 }
1489                 else
1490                 {
1491                         while (dst < end4)
1492                         {
1493                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1494                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1495                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1496                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1497                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1501                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1502                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1505                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1506                                 dst += 16;
1507                                 src += 4*sizeof(float[3]);
1508                         }
1509                 }
1510         }
1511         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1512         {
1513                 while (dst < end)
1514                 {
1515                         __m128 v = _mm_loadu_ps((const float *)src);
1516                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1517                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1518                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1519                         _mm_store_ps(dst, v);
1520                         dst += 4;
1521                         src += stride;
1522                 }
1523         }
1524         else
1525         {
1526                 while (dst < end)
1527                 {
1528                         __m128 v = _mm_load_ps((const float *)src);
1529                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1530                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1531                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1532                         _mm_store_ps(dst, v);
1533                         dst += 4;
1534                         src += stride;
1535                 }
1536         }
1537 }
1538
1539 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1540 {
1541         float *end = dst + size*4;
1542         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1543         if (stride == sizeof(float[2]))
1544         {
1545                 float *end2 = dst + (size&~1)*4;
1546                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1547                 {
1548                         while (dst < end2)
1549                         {
1550                                 __m128 v = _mm_loadu_ps((const float *)src);
1551                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1552                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1553                                 dst += 8;
1554                                 src += 2*sizeof(float[2]);
1555                         }
1556                 }
1557                 else
1558                 {
1559                         while (dst < end2)
1560                         {
1561                                 __m128 v = _mm_load_ps((const float *)src);
1562                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1563                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1564                                 dst += 8;
1565                                 src += 2*sizeof(float[2]);
1566                         }
1567                 }
1568         }
1569         while (dst < end)
1570         {
1571                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1572                 dst += 4;
1573                 src += stride;
1574         }
1575 }
1576
1577 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1578 {
1579         float *end = dst + size*4;
1580         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1581         if (stride == sizeof(unsigned char[4]))
1582         {
1583                 float *end4 = dst + (size&~3)*4;
1584                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1585                 {
1586                         while (dst < end4)
1587                         {
1588                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1589                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1590                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1591                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1592                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1593                                 dst += 16;
1594                                 src += 4*sizeof(unsigned char[4]);
1595                         }
1596                 }
1597                 else
1598                 {
1599                         while (dst < end4)
1600                         {
1601                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1602                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1603                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1604                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1605                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1606                                 dst += 16;
1607                                 src += 4*sizeof(unsigned char[4]);
1608                         }
1609                 }
1610         }
1611         while (dst < end)
1612         {
1613                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1614                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1615                 dst += 4;
1616                 src += stride;
1617         }
1618 }
1619
// writes 'size' copies of the 4 floats at src into dst
// (src may be unaligned, dst is written with aligned stores)
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
        const __m128 value = _mm_loadu_ps(src);
        int i;
        for (i = 0; i < size; i++)
                _mm_store_ps(dst + 4*i, value);
}
1630 #endif
1631
// transforms numitems 4-float vectors from in4f by the 16-float matrix and
// writes them to out4f (out4f may be the same array as in4f)
// each result is x*row0 + y*row1 + z*row2 + w*row3 where rowN is the Nth
// group of 4 floats in inmatrix16f
// out4f is written with aligned stores
// does nothing when SSE_POSSIBLE is not defined
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        __m128 row0, row1, row2, row3;
        float *stop;
        if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
        {
                // identity matrix: a plain copy (or nothing at all when transforming in place) is enough
                if (out4f != in4f)
                        memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
        stop = out4f + numitems*4;
        row0 = _mm_loadu_ps(inmatrix16f);
        row1 = _mm_loadu_ps(inmatrix16f + 4);
        row2 = _mm_loadu_ps(inmatrix16f + 8);
        row3 = _mm_loadu_ps(inmatrix16f + 12);
        if (((size_t)in4f)&(ALIGN_SIZE-1)) // input is not 16 byte aligned
        {
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        const __m128 v = _mm_loadu_ps(in4f);
                        const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
                        const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
                        const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
                        const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
                        _mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
                }
        }
        else
        {
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        const __m128 v = _mm_load_ps(in4f);
                        const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
                        const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
                        const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
                        const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
                        _mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
                }
        }
#endif
}
1679
// copies numitems 4-float vectors from in4f to out4f (the arrays must not overlap)
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
        const size_t bytes = numitems * sizeof(float[4]);
        memcpy(out4f, in4f, bytes);
}
1684
1685 #ifdef SSE_POSSIBLE
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
// perspective divide plus viewport mapping of one __m128 vertex (x, y, z, w):
//   the input is rearranged to (1, x, y, z), multiplied by viewportscale,
//   divided by w, offset by viewportcenter, then rotated back, giving
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
//   where cN/sN are the elements of viewportcenter/viewportscale
//   (so element 0 of those vectors applies to the 1/w term, elements 1-3 to x, y, z)
// 'in' is evaluated once, so out and in may be the same variable
// NOTE: expands to a bare { } block using locals named p and w; do not pass
// expressions that mention p or w, and brace any if/else that contains it
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1693
// DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale)
// the body is currently identical to DPSOFTRAST_PROJECTVERTEX: all four
// components are projected, not just y
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
//   where cN/sN are the elements of viewportcenter/viewportscale
// NOTE: expands to a bare { } block using locals named p and w; do not pass
// expressions that mention p or w, and brace any if/else that contains it
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1701
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3)
// multiplies one __m128 vertex (x, y, z, w) by a matrix given as four __m128 rows:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// 'in' is evaluated once, so out and in may be the same variable
// NOTE: expands to a bare { } block using a local named p; do not pass
// expressions that mention p, and brace any if/else that contains it
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
        __m128 p = (in); \
        out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
                                                  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
                                                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
                                                                                        _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1710
// computes the range of screen rows covered by a bounding box
// minposf/maxposf: opposite corners of the box (4 floats each, must be 16 byte aligned)
// inmatrix16f: matrix applied to the corners before projection (may be unaligned)
// starty/endy: receive the row range derived from the projected y extent
// returns a bitmask with bit k set for every box corner k that failed the
// z + w >= 0 test (0 means no corner was clipped)
// corner numbering: bit 0 of k selects max x, bit 1 selects max y, bit 2 selects max z (and max w)
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
{
        int clipmask = 0xFF;
        __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
        // minproj/maxproj track y/w in their lowest element only; they start inverted (2, -2) so that any sample replaces them
        __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
        __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
        __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
        // swap the first two columns of the matrix so transformed corners come out as (y, x, z, w),
        // which puts y in the lowest element where the scalar (_ss) operations below work
        m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
        m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
        m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
        m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k into bb[k] and store z + w in clipdist[k];
        // if that is >= 0 the corner passes: clear its clipmask bit and merge y/w into minproj/maxproj
        #define BBFRONT(k, pos) \
        { \
                DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
                clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
                { \
                        __m128 proj; \
                        clipmask &= ~(1<<k); \
                        proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                        minproj = _mm_min_ss(minproj, proj); \
                        maxproj = _mm_max_ss(maxproj, proj); \
                } \
        }
        // the eight corners, built by mixing components of minpos and maxpos
        BBFRONT(0, minpos);
        BBFRONT(1, _mm_move_ss(minpos, maxpos));
        BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
        BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
        BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
        BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
        BBFRONT(6, _mm_move_ss(maxpos, minpos));
        BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner that failed the test, look at its three neighbors along
        // the box edges (k^1, k^2, k^4); for each neighbor that passed, interpolate along
        // the edge to the point where z + w reaches 0 and merge that point's y/w into minproj/maxproj
        #define BBCLIP(k) \
        { \
                if (clipmask&(1<<k)) \
                { \
                        if (!(clipmask&(1<<(k^1)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^2)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^4)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                } \
        }
        BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // move element 2 of the viewport vectors (the y term in DPSOFTRAST_PROJECTVERTEX's layout) into the lowest element
        viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
        viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // limit the projected extent to [-2, 2] before mapping it to screen rows
        minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
        maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
        minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
        maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // starty comes from the largest y/w and endy from the smallest
        // NOTE(review): presumably the viewport y scale is negative (screen y grows downward) - confirm where fb_viewportscale is set
        *starty = _mm_cvttss_si32(maxproj);
        *endy = _mm_cvttss_si32(minproj)+1;
        return clipmask;
}
1784         
1785 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1786 {
1787         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1788         float *end = out4f + numitems*4;
1789         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1790         __m128 minpos, maxpos;
1791         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1792         {
1793                 minpos = maxpos = _mm_loadu_ps(in4f);
1794                 while (out4f < end)
1795                 {
1796                         __m128 v = _mm_loadu_ps(in4f);
1797                         minpos = _mm_min_ps(minpos, v);
1798                         maxpos = _mm_max_ps(maxpos, v);
1799                         _mm_store_ps(out4f, v);
1800                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801                         _mm_store_ps(screen4f, v);
1802                         in4f += 4;
1803                         out4f += 4;
1804                         screen4f += 4;
1805                 }
1806         }
1807         else
1808         {
1809                 minpos = maxpos = _mm_load_ps(in4f);
1810                 while (out4f < end)
1811                 {
1812                         __m128 v = _mm_load_ps(in4f);
1813                         minpos = _mm_min_ps(minpos, v);
1814                         maxpos = _mm_max_ps(maxpos, v);
1815                         _mm_store_ps(out4f, v);
1816                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1817                         _mm_store_ps(screen4f, v);
1818                         in4f += 4;
1819                         out4f += 4;
1820                         screen4f += 4;
1821                 }
1822         }
1823         if (starty && endy) 
1824         {
1825                 ALIGN(float minposf[4]);
1826                 ALIGN(float maxposf[4]);
1827                 _mm_store_ps(minposf, minpos);
1828                 _mm_store_ps(maxposf, maxpos);
1829                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1830         }
1831         return 0;
1832 }
1833
1834 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1835 {
1836         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1837         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1838         float *end;
1839         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1840                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1841         end = out4f + numitems*4;
1842         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1843         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1844         m0 = _mm_loadu_ps(inmatrix16f);
1845         m1 = _mm_loadu_ps(inmatrix16f + 4);
1846         m2 = _mm_loadu_ps(inmatrix16f + 8);
1847         m3 = _mm_loadu_ps(inmatrix16f + 12);
1848         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1849         {
1850                 minpos = maxpos = _mm_loadu_ps(in4f);
1851                 while (out4f < end)
1852                 {
1853                         __m128 v = _mm_loadu_ps(in4f);
1854                         minpos = _mm_min_ps(minpos, v);
1855                         maxpos = _mm_max_ps(maxpos, v);
1856                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1857                         _mm_store_ps(out4f, v);
1858                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1859                         _mm_store_ps(screen4f, v);
1860                         in4f += 4;
1861                         out4f += 4;
1862                         screen4f += 4;
1863                 }
1864         }
1865         else
1866         {
1867                 minpos = maxpos = _mm_load_ps(in4f);
1868                 while (out4f < end)
1869                 {
1870                         __m128 v = _mm_load_ps(in4f);
1871                         minpos = _mm_min_ps(minpos, v);
1872                         maxpos = _mm_max_ps(maxpos, v);
1873                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1874                         _mm_store_ps(out4f, v);
1875                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1876                         _mm_store_ps(screen4f, v);
1877                         in4f += 4;
1878                         out4f += 4;
1879                         screen4f += 4;
1880                 }
1881         }
1882         if (starty && endy) 
1883         {
1884                 ALIGN(float minposf[4]);
1885                 ALIGN(float maxposf[4]);
1886                 _mm_store_ps(minposf, minpos);
1887                 _mm_store_ps(maxposf, maxpos);
1888                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1889         }
1890         return 0;
1891 }
1892 #endif
1893
// expands the current draw's vertices of one input array into 4-float vectors
// stored in dpsoftrast.post_array4f[outarray] and returns that array
// inarray selects position, color, or (any other value) a texcoord array
// colors come from the float pointer if set, else the byte pointer, else the
// constant dpsoftrast.color; texcoords are only loaded when a pointer is set
// and the component count is 2, 3 or 4 (otherwise the output is left untouched)
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float * const dest = dpsoftrast.post_array4f[outarray];
        const int first = dpsoftrast.firstvertex;
        const int count = dpsoftrast.numvertices;
        const unsigned char *base;
        int stride;
        if (inarray == DPSOFTRAST_ARRAY_POSITION)
        {
                stride = dpsoftrast.stride_vertex;
                base = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
                DPSOFTRAST_Load3fTo4f(dest, base, count, stride);
        }
        else if (inarray == DPSOFTRAST_ARRAY_COLOR)
        {
                stride = dpsoftrast.stride_color;
                if (dpsoftrast.pointer_color4f)
                {
                        base = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
                        DPSOFTRAST_Load4fTo4f(dest, base, count, stride);
                }
                else if (dpsoftrast.pointer_color4ub)
                {
                        base = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
                        DPSOFTRAST_Load4bTo4f(dest, base, count, stride);
                }
                else
                        DPSOFTRAST_Fill4f(dest, dpsoftrast.color, count);
        }
        else
        {
                // everything else is treated as a texcoord unit
                const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
                stride = dpsoftrast.stride_texcoord[unit];
                if (dpsoftrast.pointer_texcoordf[unit])
                {
                        const int components = dpsoftrast.components_texcoord[unit];
                        base = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
                        if (components == 2)
                                DPSOFTRAST_Load2fTo4f(dest, base, count, stride);
                        else if (components == 3)
                                DPSOFTRAST_Load3fTo4f(dest, base, count, stride);
                        else if (components == 4)
                                DPSOFTRAST_Load4fTo4f(dest, base, count, stride);
                }
        }
        return dest;
#else
        return NULL;
#endif
}
1952
1953 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1954 {
1955         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1956         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1957         return data;
1958 }
1959
1960 #if 0
// projects a vertex array to dpsoftrast.screencoord4f without transforming it,
// updating dpsoftrast.drawclipped/drawstarty/drawendy, and returns the array
// inarray >= 0: the array is first loaded into post_array4f[outarray] via DPSOFTRAST_Array_Load
// inarray < 0: the existing contents of post_array4f[outarray] are used
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *verts;
        if (inarray >= 0)
                verts = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                verts = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(verts, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, verts, dpsoftrast.numvertices);
        return verts;
#else
        return NULL;
#endif
}
1971 #endif
1972
// transforms a vertex array in place by inmatrix16f, writes the projected
// vertices to dpsoftrast.screencoord4f, updates
// dpsoftrast.drawclipped/drawstarty/drawendy, and returns the array
// inarray >= 0: the array is first loaded into post_array4f[outarray] via DPSOFTRAST_Array_Load
// inarray < 0: the existing contents of post_array4f[outarray] are used
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        float *verts;
        if (inarray >= 0)
                verts = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                verts = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(verts, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, verts, dpsoftrast.numvertices, inmatrix16f);
        return verts;
#else
        return NULL;
#endif
}
1983
// fills zf[span->startx .. span->endx-1] with an approximation of 1/w for each pixel of the span
// w is linear in x: triangle->w[2] + (span->x + x)*triangle->w[0] + span->y*triangle->w[1]
// the reciprocal is computed exactly only at sub-span boundaries (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, plus the last pixel of the span) and
// linearly interpolated in between
// thread is not used here
void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
{
        int x;
        int startx = span->startx;
        int endx = span->endx;
        float wslope = triangle->w[0];
        // w at x = 0 of this span
        float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
        // exact 1/w at the first pixel; becomes the start value of the first sub-span
        float endz = 1.0f / (w + wslope * startx);
        for (x = startx;x < endx;)
        {
                int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                // this sub-span starts at the exact value computed for the end of the previous one
                float z = endz, dz;
                // final sub-span: stop exactly on the last pixel of the span
                if (nextsub >= endx) nextsub = endsub = endx-1;
                endz = 1.0f / (w + wslope * nextsub);
                // per-pixel step; 0 when the sub-span is a single pixel (avoids dividing by zero)
                dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
                for (; x <= endsub; x++, z += dz)
                        zf[x] = z;
        }
}
2003
2004 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2005 {
2006         int x;
2007         int startx = span->startx;
2008         int endx = span->endx;
2009         int d[4];
2010         float a, b;
2011         unsigned char * RESTRICT pixelmask = span->pixelmask;
2012         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2013         if (!pixel)
2014                 return;
2015         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2016         // handle alphatest now (this affects depth writes too)
2017         if (thread->alphatest)
2018                 for (x = startx;x < endx;x++)
2019                         if (in4f[x*4+3] < 0.5f)
2020                                 pixelmask[x] = false;
2021         // FIXME: this does not handle bigendian
2022         switch(thread->fb_blendmode)
2023         {
2024         case DPSOFTRAST_BLENDMODE_OPAQUE:
2025                 for (x = startx;x < endx;x++)
2026                 {
2027                         if (!pixelmask[x])
2028                                 continue;
2029                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2030                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2031                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2032                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2033                         pixel[x*4+0] = d[0];
2034                         pixel[x*4+1] = d[1];
2035                         pixel[x*4+2] = d[2];
2036                         pixel[x*4+3] = d[3];
2037                 }
2038                 break;
2039         case DPSOFTRAST_BLENDMODE_ALPHA:
2040                 for (x = startx;x < endx;x++)
2041                 {
2042                         if (!pixelmask[x])
2043                                 continue;
2044                         a = in4f[x*4+3] * 255.0f;
2045                         b = 1.0f - in4f[x*4+3];
2046                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2047                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2048                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2049                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2050                         pixel[x*4+0] = d[0];
2051                         pixel[x*4+1] = d[1];
2052                         pixel[x*4+2] = d[2];
2053                         pixel[x*4+3] = d[3];
2054                 }
2055                 break;
2056         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2057                 for (x = startx;x < endx;x++)
2058                 {
2059                         if (!pixelmask[x])
2060                                 continue;
2061                         a = in4f[x*4+3] * 255.0f;
2062                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2063                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2064                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2065                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2066                         pixel[x*4+0] = d[0];
2067                         pixel[x*4+1] = d[1];
2068                         pixel[x*4+2] = d[2];
2069                         pixel[x*4+3] = d[3];
2070                 }
2071                 break;
2072         case DPSOFTRAST_BLENDMODE_ADD:
2073                 for (x = startx;x < endx;x++)
2074                 {
2075                         if (!pixelmask[x])
2076                                 continue;
2077                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2078                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2079                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2080                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2081                         pixel[x*4+0] = d[0];
2082                         pixel[x*4+1] = d[1];
2083                         pixel[x*4+2] = d[2];
2084                         pixel[x*4+3] = d[3];
2085                 }
2086                 break;
2087         case DPSOFTRAST_BLENDMODE_INVMOD:
2088                 for (x = startx;x < endx;x++)
2089                 {
2090                         if (!pixelmask[x])
2091                                 continue;
2092                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2093                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2094                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2095                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2096                         pixel[x*4+0] = d[0];
2097                         pixel[x*4+1] = d[1];
2098                         pixel[x*4+2] = d[2];
2099                         pixel[x*4+3] = d[3];
2100                 }
2101                 break;
2102         case DPSOFTRAST_BLENDMODE_MUL:
2103                 for (x = startx;x < endx;x++)
2104                 {
2105                         if (!pixelmask[x])
2106                                 continue;
2107                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2108                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2109                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2110                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2111                         pixel[x*4+0] = d[0];
2112                         pixel[x*4+1] = d[1];
2113                         pixel[x*4+2] = d[2];
2114                         pixel[x*4+3] = d[3];
2115                 }
2116                 break;
2117         case DPSOFTRAST_BLENDMODE_MUL2:
2118                 for (x = startx;x < endx;x++)
2119                 {
2120                         if (!pixelmask[x])
2121                                 continue;
2122                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2123                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2124                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2125                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2126                         pixel[x*4+0] = d[0];
2127                         pixel[x*4+1] = d[1];
2128                         pixel[x*4+2] = d[2];
2129                         pixel[x*4+3] = d[3];
2130                 }
2131                 break;
2132         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2133                 for (x = startx;x < endx;x++)
2134                 {
2135                         if (!pixelmask[x])
2136                                 continue;
2137                         a = in4f[x*4+3] * -255.0f;
2138                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2139                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2140                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2141                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2142                         pixel[x*4+0] = d[0];
2143                         pixel[x*4+1] = d[1];
2144                         pixel[x*4+2] = d[2];
2145                         pixel[x*4+3] = d[3];
2146                 }
2147                 break;
2148         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2149                 for (x = startx;x < endx;x++)
2150                 {
2151                         if (!pixelmask[x])
2152                                 continue;
2153                         a = 255.0f;
2154                         b = 1.0f - in4f[x*4+3];
2155                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2156                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2157                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2158                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2159                         pixel[x*4+0] = d[0];
2160                         pixel[x*4+1] = d[1];
2161                         pixel[x*4+2] = d[2];
2162                         pixel[x*4+3] = d[3];
2163                 }
2164                 break;
2165         case DPSOFTRAST_BLENDMODE_INVADD:
2166                 for (x = startx;x < endx;x++)
2167                 {
2168                         if (!pixelmask[x])
2169                                 continue;
2170                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2171                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2172                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2173                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2174                         pixel[x*4+0] = d[0];
2175                         pixel[x*4+1] = d[1];
2176                         pixel[x*4+2] = d[2];
2177                         pixel[x*4+3] = d[3];
2178                 }
2179                 break;
2180         }
2181 }
2182
2183 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2184 {
2185 #ifdef SSE_POSSIBLE
2186         int x;
2187         int startx = span->startx;
2188         int endx = span->endx;
2189         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2190         unsigned char * RESTRICT pixelmask = span->pixelmask;
2191         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2192         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2193         if (!pixel)
2194                 return;
2195         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2196         pixeli += span->y * dpsoftrast.fb_width + span->x;
2197         // handle alphatest now (this affects depth writes too)
2198         if (thread->alphatest)
2199                 for (x = startx;x < endx;x++)
2200                         if (in4ub[x*4+3] < 0.5f)
2201                                 pixelmask[x] = false;
2202         // FIXME: this does not handle bigendian
2203         switch(thread->fb_blendmode)
2204         {
2205         case DPSOFTRAST_BLENDMODE_OPAQUE:
2206                 for (x = startx;x + 4 <= endx;)
2207                 {
2208                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2209                         {
2210                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2211                                 x += 4;
2212                         }
2213                         else
2214                         {
2215                                 if (pixelmask[x])
2216                                         pixeli[x] = ini[x];
2217                                 x++;
2218                         }
2219                 }
2220                 for (;x < endx;x++)
2221                         if (pixelmask[x])
2222                                 pixeli[x] = ini[x];
2223                 break;
2224         case DPSOFTRAST_BLENDMODE_ALPHA:
2225         #define FINISHBLEND(blend2, blend1) \
2226                 for (x = startx;x + 1 < endx;x += 2) \
2227                 { \
2228                         __m128i src, dst; \
2229                         switch (*(const unsigned short*)&pixelmask[x]) \
2230                         { \
2231                         case 0x0101: \
2232                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2233                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2234                                 blend2; \
2235                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2236                                 continue; \
2237                         case 0x0100: \
2238                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2239                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2240                                 blend1; \
2241                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2242                                 continue; \
2243                         case 0x0001: \
2244                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2245                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2246                                 blend1; \
2247                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2248                                 continue; \
2249                         } \
2250                         break; \
2251                 } \
2252                 for(;x < endx; x++) \
2253                 { \
2254                         __m128i src, dst; \
2255                         if (!pixelmask[x]) \
2256                                 continue; \
2257                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2258                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2259                         blend1; \
2260                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2261                 }
2262
2263                 FINISHBLEND({
2264                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2265                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2266                 }, {
2267                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2268                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2269                 });
2270                 break;
2271         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2272                 FINISHBLEND({
2273                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2274                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2275                 }, {
2276                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2277                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2278                 });
2279                 break;
2280         case DPSOFTRAST_BLENDMODE_ADD:
2281                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2282                 break;
2283         case DPSOFTRAST_BLENDMODE_INVMOD:
2284                 FINISHBLEND({
2285                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2286                 }, {
2287                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2288                 });
2289                 break;
2290         case DPSOFTRAST_BLENDMODE_MUL:
2291                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2292                 break;
2293         case DPSOFTRAST_BLENDMODE_MUL2:
2294                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2295                 break;
2296         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2297                 FINISHBLEND({
2298                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2299                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2300                 }, {
2301                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2302                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2303                 });
2304                 break;
2305         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2306                 FINISHBLEND({
2307                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2308                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2309                 }, {
2310                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2311                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2312                 });
2313                 break;
2314         case DPSOFTRAST_BLENDMODE_INVADD:
2315                 FINISHBLEND({
2316                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2317                 }, {
2318                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2319                 });
2320                 break;
2321         }
2322 #endif
2323 }
2324
2325 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2326 {
2327         int x;
2328         int startx = span->startx;
2329         int endx = span->endx;
2330         int flags;
2331         float c[4];
2332         float data[4];
2333         float slope[4];
2334         float tc[2], endtc[2];
2335         float tcscale[2];
2336         unsigned int tci[2];
2337         unsigned int tci1[2];
2338         unsigned int tcimin[2];
2339         unsigned int tcimax[2];
2340         int tciwrapmask[2];
2341         int tciwidth;
2342         int filter;
2343         int mip;
2344         const unsigned char * RESTRICT pixelbase;
2345         const unsigned char * RESTRICT pixel[4];
2346         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2347         // if no texture is bound, just fill it with white
2348         if (!texture)
2349         {
2350                 for (x = startx;x < endx;x++)
2351                 {
2352                         out4f[x*4+0] = 1.0f;
2353                         out4f[x*4+1] = 1.0f;
2354                         out4f[x*4+2] = 1.0f;
2355                         out4f[x*4+3] = 1.0f;
2356                 }
2357                 return;
2358         }
2359         mip = triangle->mip[texunitindex];
2360         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2361         // if this mipmap of the texture is 1 pixel, just fill it with that color
2362         if (texture->mipmap[mip][1] == 4)
2363         {
2364                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2365                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2366                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2367                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2368                 for (x = startx;x < endx;x++)
2369                 {
2370                         out4f[x*4+0] = c[0];
2371                         out4f[x*4+1] = c[1];
2372                         out4f[x*4+2] = c[2];
2373                         out4f[x*4+3] = c[3];
2374                 }
2375                 return;
2376         }
2377         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2378         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2379         flags = texture->flags;
2380         tcscale[0] = texture->mipmap[mip][2];
2381         tcscale[1] = texture->mipmap[mip][3];
2382         tciwidth = texture->mipmap[mip][2];
2383         tcimin[0] = 0;
2384         tcimin[1] = 0;
2385         tcimax[0] = texture->mipmap[mip][2]-1;
2386         tcimax[1] = texture->mipmap[mip][3]-1;
2387         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2388         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2389         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2390         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2391         if (filter)
2392         {
2393                 endtc[0] -= 0.5f;
2394                 endtc[1] -= 0.5f;
2395         }
2396         for (x = startx;x < endx;)
2397         {
2398                 unsigned int subtc[2];
2399                 unsigned int substep[2];
2400                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2401                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2402                 if (nextsub >= endx)
2403                 {
2404                         nextsub = endsub = endx-1;      
2405                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2406                 }
2407                 tc[0] = endtc[0];
2408                 tc[1] = endtc[1];
2409                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2410                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2411                 if (filter)
2412                 {
2413                         endtc[0] -= 0.5f;
2414                         endtc[1] -= 0.5f;
2415                 }
2416                 substep[0] = (endtc[0] - tc[0]) * subscale;
2417                 substep[1] = (endtc[1] - tc[1]) * subscale;
2418                 subtc[0] = tc[0] * (1<<12);
2419                 subtc[1] = tc[1] * (1<<12);
2420                 if (filter)
2421                 {
2422                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2423                         {
2424                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2425                                 {
2426                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2427                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2428                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2429                                         tci[0] = subtc[0]>>12;
2430                                         tci[1] = subtc[1]>>12;
2431                                         tci1[0] = tci[0] + 1;
2432                                         tci1[1] = tci[1] + 1;
2433                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2434                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2435                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2436                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2437                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2438                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2439                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2440                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2441                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2442                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2443                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2444                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2445                                         out4f[x*4+0] = c[0];
2446                                         out4f[x*4+1] = c[1];
2447                                         out4f[x*4+2] = c[2];
2448                                         out4f[x*4+3] = c[3];
2449                                 }
2450                         }
2451                         else
2452                         {
2453                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2454                                 {
2455                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2456                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2457                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2458                                         tci[0] = subtc[0]>>12;
2459                                         tci[1] = subtc[1]>>12;
2460                                         tci1[0] = tci[0] + 1;
2461                                         tci1[1] = tci[1] + 1;
2462                                         tci[0] &= tciwrapmask[0];
2463                                         tci[1] &= tciwrapmask[1];
2464                                         tci1[0] &= tciwrapmask[0];
2465                                         tci1[1] &= tciwrapmask[1];
2466                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2467                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2468                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2469                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2470                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2471                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2472                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2473                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2474                                         out4f[x*4+0] = c[0];
2475                                         out4f[x*4+1] = c[1];
2476                                         out4f[x*4+2] = c[2];
2477                                         out4f[x*4+3] = c[3];
2478                                 }
2479                         }
2480                 }
2481                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2482                 {
2483                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2484                         {
2485                                 tci[0] = subtc[0]>>12;
2486                                 tci[1] = subtc[1]>>12;
2487                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2488                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2489                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2490                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2491                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2492                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2493                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2494                                 out4f[x*4+0] = c[0];
2495                                 out4f[x*4+1] = c[1];
2496                                 out4f[x*4+2] = c[2];
2497                                 out4f[x*4+3] = c[3];
2498                         }
2499                 }
2500                 else
2501                 {
2502                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2503                         {
2504                                 tci[0] = subtc[0]>>12;
2505                                 tci[1] = subtc[1]>>12;
2506                                 tci[0] &= tciwrapmask[0];
2507                                 tci[1] &= tciwrapmask[1];
2508                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2509                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2510                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2511                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2512                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2513                                 out4f[x*4+0] = c[0];
2514                                 out4f[x*4+1] = c[1];
2515                                 out4f[x*4+2] = c[2];
2516                                 out4f[x*4+3] = c[3];
2517                         }
2518                 }
2519         }
2520 }
2521
2522 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2523 {
2524 #ifdef SSE_POSSIBLE
2525         int x;
2526         int startx = span->startx;
2527         int endx = span->endx;
2528         int flags;
2529         __m128 data, slope, tcscale;
2530         __m128i tcsize, tcmask, tcoffset, tcmax;
2531         __m128 tc, endtc;
2532         __m128i subtc, substep, endsubtc;
2533         int filter;
2534         int mip;
2535         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2536         const unsigned char * RESTRICT pixelbase;
2537         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2538         // if no texture is bound, just fill it with white
2539         if (!texture)
2540         {
2541                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2542                 return;
2543         }
2544         mip = triangle->mip[texunitindex];
2545         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2546         // if this mipmap of the texture is 1 pixel, just fill it with that color
2547         if (texture->mipmap[mip][1] == 4)
2548         {
2549                 unsigned int k = *((const unsigned int *)pixelbase);
2550                 for (x = startx;x < endx;x++)
2551                         outi[x] = k;
2552                 return;
2553         }
2554         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2555         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2556         flags = texture->flags;
2557         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2558         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2559         tcscale = _mm_cvtepi32_ps(tcsize);
2560         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2561         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2562         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2563         if (filter)
2564                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2565         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2566         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2567         tcmax = _mm_packs_epi32(tcmask, tcmask);
2568         for (x = startx;x < endx;)
2569         {
2570                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2571                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2572                 if (nextsub >= endx)
2573                 {
2574                         nextsub = endsub = endx-1;
2575                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2576                 }       
2577                 tc = endtc;
2578                 subtc = endsubtc;
2579                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2580                 if (filter)
2581                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2582                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2583                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2584                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2585                 substep = _mm_slli_epi32(substep, 1);
2586                 if (filter)
2587                 {
2588                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2589                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2590                         {
2591                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2592                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2593                                 {
2594                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2595                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2596                                         tci = _mm_madd_epi16(tci, tcoffset);
2597                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2598                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2599                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2600                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2601                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2602                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2603                                         fracm = _mm_srli_epi16(subtc, 1);
2604                                         pix1 = _mm_add_epi16(pix1,
2605                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2606                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2607                                         pix3 = _mm_add_epi16(pix3,
2608                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2609                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2610                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2611                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2612                                         pix2 = _mm_add_epi16(pix2,
2613                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2614                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2615                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2616                                 }
2617                                 if (x <= endsub)
2618                                 {
2619                                         const unsigned char * RESTRICT ptr1;
2620                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2621                                         tci = _mm_madd_epi16(tci, tcoffset);
2622                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2623                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2624                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2625                                         fracm = _mm_srli_epi16(subtc, 1);
2626                                         pix1 = _mm_add_epi16(pix1,
2627                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2628                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2629                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2630                                         pix1 = _mm_add_epi16(pix1,
2631                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2632                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2633                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2634                                         x++;
2635                                 }
2636                         }
2637                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2638                         {
2639                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2640                                 {
2641                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2642                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2643                                         tci = _mm_madd_epi16(tci, tcoffset);
2644                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2645                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2646                                                                                         _mm_setzero_si128());
2647                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2648                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2649                                                                                         _mm_setzero_si128());
2650                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2651                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2652                                         tci = _mm_madd_epi16(tci, tcoffset);
2653                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2654                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2655                                                                                         _mm_setzero_si128());
2656                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2657                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2658                                                                                         _mm_setzero_si128());
2659                                         fracm = _mm_srli_epi16(subtc, 1);
2660                                         pix1 = _mm_add_epi16(pix1,
2661                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2662                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2663                                         pix3 = _mm_add_epi16(pix3,
2664                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2665                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2666                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2667                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2668                                         pix2 = _mm_add_epi16(pix2,
2669                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2670                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2671                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2672                                 }
2673                                 if (x <= endsub)
2674                                 {
2675                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2676                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2677                                         tci = _mm_madd_epi16(tci, tcoffset);
2678                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2679                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2680                                                                                         _mm_setzero_si128());
2681                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2682                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2683                                                                                         _mm_setzero_si128());
2684                                         fracm = _mm_srli_epi16(subtc, 1);
2685                                         pix1 = _mm_add_epi16(pix1,
2686                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2687                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2688                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2689                                         pix1 = _mm_add_epi16(pix1,
2690                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2691                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2692                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2693                                         x++;
2694                                 }
2695                         }
2696                         else
2697                         {
2698                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2699                                 {
2700                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2701                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2702                                         tci = _mm_madd_epi16(tci, tcoffset);
2703                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2704                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2705                                                                                         _mm_setzero_si128());
2706                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2707                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2708                                                                                         _mm_setzero_si128());
2709                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2710                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2711                                         tci = _mm_madd_epi16(tci, tcoffset);
2712                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2713                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2714                                                                                         _mm_setzero_si128());
2715                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2716                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2717                                                                                         _mm_setzero_si128());
2718                                         fracm = _mm_srli_epi16(subtc, 1);
2719                                         pix1 = _mm_add_epi16(pix1,
2720                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2721                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2722                                         pix3 = _mm_add_epi16(pix3,
2723                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2724                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2725                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2726                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2727                                         pix2 = _mm_add_epi16(pix2,
2728                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2729                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2730                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2731                                 }
2732                                 if (x <= endsub)
2733                                 {
2734                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2735                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2736                                         tci = _mm_madd_epi16(tci, tcoffset);
2737                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2738                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2739                                                                                         _mm_setzero_si128());
2740                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2741                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2742                                                                                         _mm_setzero_si128());
2743                                         fracm = _mm_srli_epi16(subtc, 1);
2744                                         pix1 = _mm_add_epi16(pix1,
2745                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2746                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2747                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2748                                         pix1 = _mm_add_epi16(pix1,
2749                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2750                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2751                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2752                                         x++;
2753                                 }
2754                         }
2755                 }
2756                 else
2757                 {
2758                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2759                         {
2760                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2761                                 {
2762                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2763                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2764                                         tci = _mm_madd_epi16(tci, tcoffset);
2765                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2766                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2767                                 }
2768                                 if (x <= endsub)
2769                                 {
2770                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2771                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2772                                         tci = _mm_madd_epi16(tci, tcoffset);
2773                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2774                                         x++;
2775                                 }
2776                         }
2777                         else
2778                         {
2779                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2780                                 {
2781                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2782                                         tci = _mm_and_si128(tci, tcmax); 
2783                                         tci = _mm_madd_epi16(tci, tcoffset);
2784                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2785                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2786                                 }
2787                                 if (x <= endsub)
2788                                 {
2789                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2790                                         tci = _mm_and_si128(tci, tcmax); 
2791                                         tci = _mm_madd_epi16(tci, tcoffset);
2792                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2793                                         x++;
2794                                 }
2795                         }
2796                 }
2797         }
2798 #endif
2799 }
2800
2801 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2802 {
2803         // TODO: IMPLEMENT
2804         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2805 }
2806
// Shadowmap sampling stub: the vector argument is not read and the result is
// always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        return result;
}
2812
2813 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2814 {
2815         int x;
2816         int startx = span->startx;
2817         int endx = span->endx;
2818         float c[4];
2819         float data[4];
2820         float slope[4];
2821         float z;
2822         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2823         for (x = startx;x < endx;x++)
2824         {
2825                 z = zf[x];
2826                 c[0] = (data[0] + slope[0]*x) * z;
2827                 c[1] = (data[1] + slope[1]*x) * z;
2828                 c[2] = (data[2] + slope[2]*x) * z;
2829                 c[3] = (data[3] + slope[3]*x) * z;
2830                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2831                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2832                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2833                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2834         }
2835 }
2836
2837 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2838 {
2839         int x;
2840         int startx = span->startx;
2841         int endx = span->endx;
2842         float c[4];
2843         float data[4];
2844         float slope[4];
2845         float z;
2846         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2847         for (x = startx;x < endx;x++)
2848         {
2849                 z = zf[x];
2850                 c[0] = (data[0] + slope[0]*x) * z;
2851                 c[1] = (data[1] + slope[1]*x) * z;
2852                 c[2] = (data[2] + slope[2]*x) * z;
2853                 c[3] = (data[3] + slope[3]*x) * z;
2854                 out4f[x*4+0] = c[0];
2855                 out4f[x*4+1] = c[1];
2856                 out4f[x*4+2] = c[2];
2857                 out4f[x*4+3] = c[3];
2858         }
2859 }
2860
2861 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2862 {
2863         int x, startx = span->startx, endx = span->endx;
2864         float c[4], localcolor[4];
2865         localcolor[0] = subcolor[0];
2866         localcolor[1] = subcolor[1];
2867         localcolor[2] = subcolor[2];
2868         localcolor[3] = subcolor[3];
2869         for (x = startx;x < endx;x++)
2870         {
2871                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2872                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2873                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2874                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2875                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2876                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2877                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2878                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2879         }
2880 }
2881
2882 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2883 {
2884         int x, startx = span->startx, endx = span->endx;
2885         for (x = startx;x < endx;x++)
2886         {
2887                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2888                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2889                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2890                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2891         }
2892 }
2893
2894 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2895 {
2896         int x, startx = span->startx, endx = span->endx;
2897         for (x = startx;x < endx;x++)
2898         {
2899                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2900                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2901                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2902                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2903         }
2904 }
2905
2906 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2907 {
2908         int x, startx = span->startx, endx = span->endx;
2909         float a, b;
2910         for (x = startx;x < endx;x++)
2911         {
2912                 a = 1.0f - inb4f[x*4+3];
2913                 b = inb4f[x*4+3];
2914                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2915                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2916                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2917                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2918         }
2919 }
2920
2921 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2922 {
2923         int x, startx = span->startx, endx = span->endx;
2924         float localcolor[4], ilerp, lerp;
2925         localcolor[0] = color[0];
2926         localcolor[1] = color[1];
2927         localcolor[2] = color[2];
2928         localcolor[3] = color[3];
2929         ilerp = 1.0f - localcolor[3];
2930         lerp = localcolor[3];
2931         for (x = startx;x < endx;x++)
2932         {
2933                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2934                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2935                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2936                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2937         }
2938 }
2939
2940
2941
2942 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2943 {
2944 #ifdef SSE_POSSIBLE
2945         int x;
2946         int startx = span->startx;
2947         int endx = span->endx;
2948         __m128 data, slope;
2949         __m128 mod, endmod;
2950         __m128i submod, substep, endsubmod;
2951         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2952         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2953         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2954         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2955         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2956         for (x = startx; x < endx;)
2957         {
2958                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2959                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2960                 if (nextsub >= endx)
2961                 {
2962                         nextsub = endsub = endx-1;
2963                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2964                 }
2965                 mod = endmod;
2966                 submod = endsubmod;
2967                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2968                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2969                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2970                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2971                 substep = _mm_packs_epi32(substep, substep);
2972                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2973                 {
2974                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2975                         pix = _mm_mulhi_epu16(pix, submod);
2976                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2977                 }
2978                 if (x <= endsub)
2979                 {
2980                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2981                         pix = _mm_mulhi_epu16(pix, submod);
2982                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2983                         x++;
2984                 }
2985         }
2986 #endif
2987 }
2988
2989 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2990 {
2991 #ifdef SSE_POSSIBLE
2992         int x;
2993         int startx = span->startx;
2994         int endx = span->endx;
2995         __m128 data, slope;
2996         __m128 mod, endmod;
2997         __m128i submod, substep, endsubmod;
2998         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2999         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3000         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3001         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3002         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3003         for (x = startx; x < endx;)
3004         {
3005                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3006                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3007                 if (nextsub >= endx)
3008                 {
3009                         nextsub = endsub = endx-1;
3010                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3011                 }
3012                 mod = endmod;
3013                 submod = endsubmod;
3014                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3015                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3016                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3017                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3018                 substep = _mm_packs_epi32(substep, substep);
3019                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3020                 {
3021                         __m128i pix = _mm_srai_epi16(submod, 4);
3022                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3023                 }
3024                 if (x <= endsub)
3025                 {
3026                         __m128i pix = _mm_srai_epi16(submod, 4);
3027                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3028                         x++;
3029                 }
3030         }
3031 #endif
3032 }
3033
3034 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3035 {
3036 #ifdef SSE_POSSIBLE
3037         int x, startx = span->startx, endx = span->endx;
3038         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3039         localcolor = _mm_packs_epi32(localcolor, localcolor);
3040         for (x = startx;x+2 <= endx;x+=2)
3041         {
3042                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3043                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3044                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3045                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3046         }
3047         if (x < endx)
3048         {
3049                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3050                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3051                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3052                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3053         }
3054 #endif
3055 }
3056
3057 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3058 {
3059 #ifdef SSE_POSSIBLE
3060         int x, startx = span->startx, endx = span->endx;
3061         for (x = startx;x+2 <= endx;x+=2)
3062         {
3063                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3064                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3065                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3066                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3067         }
3068         if (x < endx)
3069         {
3070                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3071                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3072                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3073                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3074         }
3075 #endif
3076 }
3077
3078 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3079 {
3080 #ifdef SSE_POSSIBLE
3081         int x, startx = span->startx, endx = span->endx;
3082         for (x = startx;x+2 <= endx;x+=2)
3083         {
3084                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3086                 pix1 = _mm_add_epi16(pix1, pix2);
3087                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3088         }
3089         if (x < endx)
3090         {
3091                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3092                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3093                 pix1 = _mm_add_epi16(pix1, pix2);
3094                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3095         }
3096 #endif
3097 }
3098
3099 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3100 {
3101 #ifdef SSE_POSSIBLE
3102         int x, startx = span->startx, endx = span->endx;
3103         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3104         tint = _mm_packs_epi32(tint, tint);
3105         for (x = startx;x+2 <= endx;x+=2)
3106         {
3107                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3108                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3109                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3110                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3111         }
3112         if (x < endx)
3113         {
3114                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3115                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3116                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3117                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3118         }
3119 #endif
3120 }
3121
3122 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3123 {
3124 #ifdef SSE_POSSIBLE
3125         int x, startx = span->startx, endx = span->endx;
3126         for (x = startx;x+2 <= endx;x+=2)
3127         {
3128                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3129                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3130                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3131                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3132                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3133         }
3134         if (x < endx)
3135         {
3136                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3137                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3138                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3139                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3140                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3141         }
3142 #endif
3143 }
3144
3145 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3146 {
3147 #ifdef SSE_POSSIBLE
3148         int x, startx = span->startx, endx = span->endx;
3149         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3150         localcolor = _mm_packs_epi32(localcolor, localcolor);
3151         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3152         for (x = startx;x+2 <= endx;x+=2)
3153         {
3154                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3155                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3156                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3157         }
3158         if (x < endx)
3159         {
3160                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3161                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3162                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3163         }
3164 #endif
3165 }
3166
3167
3168
3169 void DPSOFTRAST_VertexShader_Generic(void)
3170 {
3171         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3172         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3173         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3174         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3175                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3176 }
3177
3178 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3179 {
3180         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3181         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3182         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3183         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3184         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3185         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3186         {
3187                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3188                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3189                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3190                 {
3191                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3192                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3193                         {
3194                                 // multiply
3195                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3196                         }
3197                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3198                         {
3199                                 // add
3200                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3201                         }
3202                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3203                         {
3204                                 // alphablend
3205                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3206                         }
3207                 }
3208         }
3209         else
3210                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3211         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3212 }
3213
3214
3215
3216 void DPSOFTRAST_VertexShader_PostProcess(void)
3217 {
3218         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3219         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3220         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3221 }
3222
3223 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3224 {
3225         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3226         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3227         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3230         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3231         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3232         {
3233                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3234                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3235         }
3236         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3237         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3238         {
3239                 // TODO: implement saturation
3240         }
3241         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3242         {
3243                 // TODO: implement gammaramps
3244         }
3245         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3246 }
3247
3248
3249
3250 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3251 {
3252         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3253 }
3254
3255 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3256 {
3257         // this is never called (because colormask is off when this shader is used)
3258         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3259         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3260         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3261         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3262         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3263 }
3264
3265
3266
3267 void DPSOFTRAST_VertexShader_FlatColor(void)
3268 {
3269         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3270         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3271 }
3272
3273 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3274 {
3275 #ifdef SSE_POSSIBLE
3276         unsigned char * RESTRICT pixelmask = span->pixelmask;
3277         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3278         int x, startx = span->startx, endx = span->endx;
3279         __m128i Color_Ambientm;
3280         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3281         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3282         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3283         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3284         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3285         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3286                 pixel = buffer_FragColorbgra8;
3287         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3288         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3289         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3290         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3291         for (x = startx;x < endx;x++)
3292         {
3293                 __m128i color, pix;
3294                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3295                 {
3296                         __m128i pix2;
3297                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3298                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3299                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3300                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3301                         x += 3;
3302                         continue;
3303                 }
3304                 if (!pixelmask[x])
3305                         continue;
3306                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3307                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3308                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3309         }
3310         if (pixel == buffer_FragColorbgra8)
3311                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3312 #endif
3313 }
3314
3315
3316
3317 void DPSOFTRAST_VertexShader_VertexColor(void)
3318 {
3319         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3320         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3321         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3322 }
3323
3324 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3325 {
3326 #ifdef SSE_POSSIBLE
3327         unsigned char * RESTRICT pixelmask = span->pixelmask;
3328         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3329         int x, startx = span->startx, endx = span->endx;
3330         __m128i Color_Ambientm, Color_Diffusem;
3331         __m128 data, slope;
3332         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3333         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3334         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3335         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3336         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3337         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3338         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3339                 pixel = buffer_FragColorbgra8;
3340         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3341         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3342         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3343         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3344         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3345         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3346         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3347         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3348         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3349         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3350         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3351         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3352         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3353         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3354         {
3355                 __m128i color, mod, pix;
3356                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3357                 {
3358                         __m128i pix2, mod2;
3359                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3360                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3361                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3362                         data = _mm_add_ps(data, slope);
3363                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3364                         data = _mm_add_ps(data, slope);
3365                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3366                         data = _mm_add_ps(data, slope);
3367                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3368                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3369                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3370                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3371                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3372                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3373                         x += 3;
3374                         continue;
3375                 }
3376                 if (!pixelmask[x])
3377                         continue;
3378                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3379                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3380                 mod = _mm_packs_epi32(mod, mod);
3381                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3382                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3383         }
3384         if (pixel == buffer_FragColorbgra8)
3385                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3386 #endif
3387 }
3388
3389
3390
3391 void DPSOFTRAST_VertexShader_Lightmap(void)
3392 {
3393         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3394         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3395         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3396 }
3397
3398 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3399 {
3400 #ifdef SSE_POSSIBLE
3401         unsigned char * RESTRICT pixelmask = span->pixelmask;
3402         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3403         int x, startx = span->startx, endx = span->endx;
3404         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3405         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3406         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3407         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3408         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3409         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3410         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3411         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3412         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3413         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3414                 pixel = buffer_FragColorbgra8;
3415         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3416         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3417         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3418         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3419         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3420         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3421         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3422         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3423         {
3424                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3425                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3426                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3427                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3428                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3429                 for (x = startx;x < endx;x++)
3430                 {
3431                         __m128i color, lightmap, glow, pix;
3432                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3433                         {
3434                                 __m128i pix2;
3435                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3436                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3437                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3438                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3439                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3440                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3441                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3442                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3443                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3444                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3445                                 x += 3;
3446                                 continue;
3447                         }
3448                         if (!pixelmask[x])
3449                                 continue;
3450                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3451                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3452                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3453                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3454                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3455                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3456                 }
3457         }
3458         else
3459         {
3460                 for (x = startx;x < endx;x++)
3461                 {
3462                         __m128i color, lightmap, pix;
3463                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3464                         {
3465                                 __m128i pix2;
3466                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3467                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3468                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3469                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3470                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3471                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3472                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3473                                 x += 3;
3474                                 continue;
3475                         }
3476                         if (!pixelmask[x]) 
3477                                 continue;
3478                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3479                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3480                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3481                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3482                 }
3483         }
3484         if (pixel == buffer_FragColorbgra8)
3485                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3486 #endif
3487 }
3488
3489
// Forward declarations: the FakeLight and LightDirectionMap shader entry points that follow
// call these two functions, which are defined further down in this file.
3490 void DPSOFTRAST_VertexShader_LightDirection(void);
3491 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3492
// Vertex shader entry point for the FakeLight mode.
// Does no work of its own: it only runs DPSOFTRAST_VertexShader_LightDirection.
3493 void DPSOFTRAST_VertexShader_FakeLight(void)
3494 {
3495         DPSOFTRAST_VertexShader_LightDirection();
3496 }
3497
// Pixel shader entry point for the FakeLight mode.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which compares thread->shader_mode against SHADERMODE_FAKELIGHT to pick its FakeLight path.
3498 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3499 {
3500         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3501 }
3502
3503
3504
// Vertex shader entry point for the model-space LightDirectionMap mode.
// Runs the shared LightDirection vertex shader, then additionally calls
// DPSOFTRAST_Array_Load on TEXCOORD4. DPSOFTRAST_PixelShader_LightDirection passes
// DPSOFTRAST_ARRAY_TEXCOORD4 to its lightmap and deluxemap texture lookups when
// thread->shader_mode is SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3505 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3506 {
3507         DPSOFTRAST_VertexShader_LightDirection();
             // NOTE(review): presumably makes the TEXCOORD4 input array available to the
             // later stages unchanged - confirm against DPSOFTRAST_Array_Load.
3508         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3509 }
3510
// Pixel shader entry point for the model-space LightDirectionMap mode.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which compares thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE
// to pick its model-space deluxemap path.
3511 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3512 {
3513         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3514 }
3515
3516
3517
// Vertex shader entry point for the tangent-space LightDirectionMap mode.
// Runs the shared LightDirection vertex shader, then additionally calls
// DPSOFTRAST_Array_Load on TEXCOORD4. DPSOFTRAST_PixelShader_LightDirection passes
// DPSOFTRAST_ARRAY_TEXCOORD4 to its lightmap and deluxemap texture lookups when
// thread->shader_mode is SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3518 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3519 {
3520         DPSOFTRAST_VertexShader_LightDirection();
             // NOTE(review): presumably makes the TEXCOORD4 input array available to the
             // later stages unchanged - confirm against DPSOFTRAST_Array_Load.
3521         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3522 }
3523
// Pixel shader entry point for the tangent-space LightDirectionMap mode.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which compares thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE
// to pick its tangent-space deluxemap path.
3524 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3525 {
3526         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3527 }
3528
3529
3530
// Vertex shader shared by the LightDirection, FakeLight and LightDirectionMap entry points.
// For each of dpsoftrast.numvertices vertices it:
//  - takes the dot product of the LightDir uniform with each of the three per-vertex vectors
//    read from TEXCOORD1, TEXCOORD2 and TEXCOORD3 (svector, tvector, normal) and writes the
//    three results to TEXCOORD5 (w = 0),
//  - does the same with the vector (EyePosition - vertex position) and writes the results
//    to TEXCOORD6 (w = 0).
// Finally it calls DPSOFTRAST_Array_TransformProject on POSITION with the
// ModelViewProjection matrix uniform.
3531 void DPSOFTRAST_VertexShader_LightDirection(void)
3532 {
3533         int i;
3534         int numvertices = dpsoftrast.numvertices;
3535         float LightDir[4];
3536         float LightVector[4];
3537         float EyePosition[4];
3538         float EyeVectorModelSpace[4];
3539         float EyeVector[4];
3540         float position[4];
3541         float svector[4];
3542         float tvector[4];
3543         float normal[4];
             // copy the two uniforms into locals; note that LightDir[3] and EyePosition[3]
             // are loaded here but never read again in this function
3544         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3545         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3546         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3547         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3548         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3549         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3550         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3551         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
             // NOTE(review): these calls presumably fill dpsoftrast.post_array4f[...] for the
             // arrays read in the loop below (TEXCOORD0 going through the TexMatrix uniform) -
             // confirm against DPSOFTRAST_Array_Load / DPSOFTRAST_Array_Transform.
3552         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3553         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3554         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3555         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3556         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3557         for (i = 0;i < numvertices;i++)
3558         {
                     // fetch x, y, z of this vertex's position and its three basis vectors;
                     // the fourth element of each local array is never written or read
3559                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3560                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3561                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3562                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3563                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3564                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3565                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3566                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3567                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3568                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3569                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3570                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
                     // LightVector = (svector . LightDir, tvector . LightDir, normal . LightDir)
3571                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3572                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3573                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
                     // TEXCOORD5 is what DPSOFTRAST_PixelShader_LightDirection later reads into
                     // LightVectordata/LightVectorslope
3574                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3575                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3576                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3577                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
                     // vector from the vertex position to EyePosition (not normalized here)
3578                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3579                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3580                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
                     // same three dot products as for LightVector, applied to the eye vector
3581                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3582                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3583                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
                     // TEXCOORD6 is what DPSOFTRAST_PixelShader_LightDirection later reads into
                     // EyeVectordata/EyeVectorslope
3584                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3585                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3586                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3587                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3588         }
             // called only after the loop, so the loop above reads post_array4f[POSITION] as left
             // by DPSOFTRAST_Array_Load
             // NOTE(review): the meaning of the -1 argument is not visible here - see
             // DPSOFTRAST_Array_TransformProject.
3589         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3590 }
3591
3592 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3593 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3594 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3595 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3596 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3597 #define DPSOFTRAST_Vector3Normalize(v)\
3598 do\
3599 {\
3600         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3601         if (len)\
3602         {\
3603                 len = 1.0f / len;\
3604                 v[0] *= len;\
3605                 v[1] *= len;\
3606                 v[2] *= len;\
3607         }\
3608 }\
3609 while(0)
3610
3611 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3612 {
3613         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3614         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3615         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3616         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3617         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3618         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3619         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3620         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3621         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3622         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3623         int x, startx = span->startx, endx = span->endx;
3624         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3625         float LightVectordata[4];
3626         float LightVectorslope[4];
3627         float EyeVectordata[4];
3628         float EyeVectorslope[4];
3629         float VectorSdata[4];
3630         float VectorSslope[4];
3631         float VectorTdata[4];
3632         float VectorTslope[4];
3633         float VectorRdata[4];
3634         float VectorRslope[4];
3635         float z;
3636         float diffusetex[4];
3637         float glosstex[4];
3638         float surfacenormal[4];
3639         float lightnormal[4];
3640         float lightnormal_modelspace[4];
3641         float eyenormal[4];
3642         float specularnormal[4];
3643         float diffuse;
3644         float specular;
3645         float SpecularPower;
3646         int d[4];
3647         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3648         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3649         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3650         Color_Glow[3] = 0.0f;
3651         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3652         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3653         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3654         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3655         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3656         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3657         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3658         Color_Pants[3] = 0.0f;
3659         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3660         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3661         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3662         Color_Shirt[3] = 0.0f;
3663         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3664         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3665         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3666         {
3667                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3668                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3669         }
3670         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3671         {
3672                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3673         }
3674         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3675         {
3676                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3677                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3678                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3679                 Color_Diffuse[3] = 0.0f;
3680                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3681                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3682                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3683                 LightColor[3] = 0.0f;
3684                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3685                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3686                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3687                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3688                 Color_Specular[3] = 0.0f;
3689                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3690                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3691                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3692
3693                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3694                 {
3695                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3696                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3697                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3698                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3699                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3700                 }
3701                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3702                 {
3703                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3704                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3705                 }
3706                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3707                 {
3708                         // nothing of this needed
3709                 }
3710                 else
3711                 {
3712                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3713                 }
3714
3715                 for (x = startx;x < endx;x++)
3716                 {
3717                         z = buffer_z[x];
3718                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3719                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3720                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3721                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3722                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3723                         {
3724                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3725                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3726                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3727                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3728                         }
3729                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3730                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3731                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3732                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3733                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3734                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3735                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3736                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3737
3738                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3739                         {
3740                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3741                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3742                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3743                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3744
3745                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3746                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3747                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3748                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3749
3750                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3751                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3752                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3753                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3754
3755                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3756                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3757                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3758                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3759
3760                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3761                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3762
3763                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3764                                 {
3765                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3766                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3767                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3768                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3769                                 }
3770                         }
3771                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3772                         {
3773                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3774                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3775                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3776                                 {
3777                                         float f = 1.0f / 256.0f;
3778                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3779                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3780                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3781                                 }
3782                         }
3783                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3784                         {
3785                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3786                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3787                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3788                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3789
3790                                 LightColor[0] = 1.0;
3791                                 LightColor[1] = 1.0;
3792                                 LightColor[2] = 1.0;
3793                         }
3794                         else
3795                         {
3796                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3797                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3798                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3799                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3800                         }
3801
3802                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3803
3804                         if(thread->shader_exactspecularmath)
3805                         {
3806                                 // reflect lightnormal at surfacenormal, take the negative of that
3807                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3808                                 float f;
3809                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3810                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3811                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3812                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3813
3814                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3815                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3816                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3817                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3818                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3819
3820                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3821                         }
3822                         else
3823                         {
3824                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3825                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3826                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3827                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3828
3829                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3830                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3831                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3832                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3833
3834                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3835                         }
3836
3837                         specular = pow(specular, SpecularPower * glosstex[3]);
3838                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3839                         {
3840                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3841                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3842                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3843                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3844                         }
3845                         else
3846                         {
3847                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3848                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3849                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3850                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3851                         }
3852
3853                         buffer_FragColorbgra8[x*4+0] = d[0];
3854                         buffer_FragColorbgra8[x*4+1] = d[1];
3855                         buffer_FragColorbgra8[x*4+2] = d[2];
3856                         buffer_FragColorbgra8[x*4+3] = d[3];
3857                 }
3858         }
3859         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3860         {
3861                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3862                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3863                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3864                 Color_Diffuse[3] = 0.0f;
3865                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3866                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3867                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3868                 LightColor[3] = 0.0f;
3869                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3870
3871                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3872                 {
3873                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3874                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3875                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3876                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3877                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3878                 }
3879                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3880                 {
3881                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3882                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3883                 }
3884                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3885                 {
3886                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3887                 }
3888                 else
3889                 {
3890                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3891                 }
3892
3893                 for (x = startx;x < endx;x++)
3894                 {
3895                         z = buffer_z[x];
3896                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3897                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3898                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3899                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3900                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3901                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3902                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3903                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3904
3905                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3906                         {
3907                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3908                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3909                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3910                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3911
3912                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3913                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3914                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3915                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3916
3917                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3918                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3919                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3920                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3921
3922                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3923                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3924                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3925                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3926
3927                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3928                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3929
3930                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3931                                 {
3932                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3933                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3934                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3935                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3936                                 }
3937                         }
3938                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3939                         {
3940                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3941                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3942                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3943                                 {
3944                                         float f = 1.0f / 256.0f;
3945                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3946                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3947                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3948                                 }
3949                         }
3950                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3951                         {
3952                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3953                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3954                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3955                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3956
3957                                 LightColor[0] = 1.0;
3958                                 LightColor[1] = 1.0;
3959                                 LightColor[2] = 1.0;
3960                         }
3961                         else
3962                         {
3963                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3964                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3965                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3966                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3967                         }
3968
3969                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3970                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3971                         {
3972                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3973                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3974                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3975                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3976                         }
3977                         else
3978                         {
3979                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3980                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3981                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3982                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3983                         }
3984                         buffer_FragColorbgra8[x*4+0] = d[0];
3985                         buffer_FragColorbgra8[x*4+1] = d[1];
3986                         buffer_FragColorbgra8[x*4+2] = d[2];
3987                         buffer_FragColorbgra8[x*4+3] = d[3];
3988                 }
3989         }
3990         else
3991         {
3992                 for (x = startx;x < endx;x++)
3993                 {
3994                         z = buffer_z[x];
3995                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3996                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3997                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3998                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3999
4000                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4001                         {
4002                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4003                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4004                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4005                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4006                         }
4007                         else
4008                         {
4009                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4010                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4011                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4012                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4013                         }
4014                         buffer_FragColorbgra8[x*4+0] = d[0];
4015                         buffer_FragColorbgra8[x*4+1] = d[1];
4016                         buffer_FragColorbgra8[x*4+2] = d[2];
4017                         buffer_FragColorbgra8[x*4+3] = d[3];
4018                 }
4019         }
4020         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4021 }
4022
4023
4024
4025 void DPSOFTRAST_VertexShader_LightSource(void)
4026 {
4027         int i;
4028         int numvertices = dpsoftrast.numvertices;
4029         float LightPosition[4];
4030         float LightVector[4];
4031         float LightVectorModelSpace[4];
4032         float EyePosition[4];
4033         float EyeVectorModelSpace[4];
4034         float EyeVector[4];
4035         float position[4];
4036         float svector[4];
4037         float tvector[4];
4038         float normal[4];
4039         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4040         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4041         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4042         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4043         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4044         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4045         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4046         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4047         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4048         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4049         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4050         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4051         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4052         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4053         for (i = 0;i < numvertices;i++)
4054         {
4055                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4056                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4057                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4058                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4059                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4060                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4061                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4062                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4063                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4064                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4065                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4066                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4067                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4068                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4069                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4070                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4071                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4072                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4073                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4074                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4075                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4076                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4077                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4078                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4079                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4080                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4081                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4082                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4083                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4084                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4085                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4086                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4087         }
4088         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4089         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4090 }
4091
4092 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4093 {
4094 #ifdef SSE_POSSIBLE
4095         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4096         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4097         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4098         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4099         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4100         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4101         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4102         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4103         int x, startx = span->startx, endx = span->endx;
4104         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4105         float CubeVectordata[4];
4106         float CubeVectorslope[4];
4107         float LightVectordata[4];
4108         float LightVectorslope[4];
4109         float EyeVectordata[4];
4110         float EyeVectorslope[4];
4111         float z;
4112         float diffusetex[4];
4113         float glosstex[4];
4114         float surfacenormal[4];
4115         float lightnormal[4];
4116         float eyenormal[4];
4117         float specularnormal[4];
4118         float diffuse;
4119         float specular;
4120         float SpecularPower;
4121         float CubeVector[4];
4122         float attenuation;
4123         int d[4];
4124         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4125         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4126         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4127         Color_Glow[3] = 0.0f;
4128         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4129         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4130         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4131         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4132         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4133         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4134         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4135         Color_Diffuse[3] = 0.0f;
4136         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4137         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4138         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4139         Color_Specular[3] = 0.0f;
4140         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4141         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4142         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4143         Color_Pants[3] = 0.0f;
4144         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4145         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4146         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4147         Color_Shirt[3] = 0.0f;
4148         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4149         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4150         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4151         LightColor[3] = 0.0f;
4152         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4153         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4154         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4155         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4156         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4157         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4158         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4159         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4160         {
4161                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4162                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4163         }
4164         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4165                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4166         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4167         {
4168                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4169                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4170                 for (x = startx;x < endx;x++)
4171                 {
4172                         z = buffer_z[x];
4173                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4174                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4175                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4176                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4177                         if (attenuation < 0.01f)
4178                                 continue;
4179                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4180                         {
4181                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4182                                 if (attenuation < 0.01f)
4183                                         continue;
4184                         }
4185
4186                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4187                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4188                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4189                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4190                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4191                         {
4192                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4193                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4194                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4195                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4196                         }
4197                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4198                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4199                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4200                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4201                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4202                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4203                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4204                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4205
4206                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4207                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4208                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4209                         DPSOFTRAST_Vector3Normalize(lightnormal);
4210
4211                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4212
4213                         if(thread->shader_exactspecularmath)
4214                         {
4215                                 // reflect lightnormal at surfacenormal, take the negative of that
4216                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4217                                 float f;
4218                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4219                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4220                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4221                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4222
4223                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4224                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4225                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4226                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4227                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4228
4229                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4230                         }
4231                         else
4232                         {
4233                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4234                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4235                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4236                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4237
4238                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4239                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4240                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4241                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4242
4243                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4244                         }
4245                         specular = pow(specular, SpecularPower * glosstex[3]);
4246
4247                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4248                         {
4249                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4250                                 attenuation *= (1.0f / 255.0f);
4251                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4252                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4253                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4254                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4255                         }
4256                         else
4257                         {
4258                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4259                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4260                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4261                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4262                         }
4263                         buffer_FragColorbgra8[x*4+0] = d[0];
4264                         buffer_FragColorbgra8[x*4+1] = d[1];
4265                         buffer_FragColorbgra8[x*4+2] = d[2];
4266                         buffer_FragColorbgra8[x*4+3] = d[3];
4267                 }
4268         }
4269         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4270         {
4271                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4272                 for (x = startx;x < endx;x++)
4273                 {
4274                         z = buffer_z[x];
4275                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4276                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4277                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4278                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4279                         if (attenuation < 0.01f)
4280                                 continue;
4281                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4282                         {
4283                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4284                                 if (attenuation < 0.01f)
4285                                         continue;
4286                         }
4287
4288                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4289                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4290                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4291                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4292                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4293                         {
4294                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4295                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4296                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4297                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4298                         }
4299                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4300                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4301                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4302                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4303
4304                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4305                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4306                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4307                         DPSOFTRAST_Vector3Normalize(lightnormal);
4308
4309                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4310                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4311                         {
4312                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4313                                 attenuation *= (1.0f / 255.0f);
4314                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4315                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4316                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4317                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4318                         }
4319                         else
4320                         {
4321                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4322                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4323                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4324                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4325                         }
4326                         buffer_FragColorbgra8[x*4+0] = d[0];
4327                         buffer_FragColorbgra8[x*4+1] = d[1];
4328                         buffer_FragColorbgra8[x*4+2] = d[2];
4329                         buffer_FragColorbgra8[x*4+3] = d[3];
4330                 }
4331         }
4332         else
4333         {
4334                 for (x = startx;x < endx;x++)
4335                 {
4336                         z = buffer_z[x];
4337                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4338                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4339                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4340                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4341                         if (attenuation < 0.01f)
4342                                 continue;
4343                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4344                         {
4345                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4346                                 if (attenuation < 0.01f)
4347                                         continue;
4348                         }
4349
4350                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4351                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4352                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4353                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4354                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4355                         {
4356                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4357                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4358                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4359                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4360                         }
4361                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4362                         {
4363                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4364                                 attenuation *= (1.0f / 255.0f);
4365                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4366                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4367                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4368                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4369                         }
4370                         else
4371                         {
4372                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4373                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4374                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4375                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4376                         }
4377                         buffer_FragColorbgra8[x*4+0] = d[0];
4378                         buffer_FragColorbgra8[x*4+1] = d[1];
4379                         buffer_FragColorbgra8[x*4+2] = d[2];
4380                         buffer_FragColorbgra8[x*4+3] = d[3];
4381                 }
4382         }
4383         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4384 #endif
4385 }
4386
4387
4388
4389 void DPSOFTRAST_VertexShader_Refraction(void)
4390 {
4391         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4392         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4393         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4394 }
4395
4396 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4397 {
4398         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4399
4400         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4401         float z;
4402         int x, startx = span->startx, endx = span->endx;
4403
4404         // texture reads
4405         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4406         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4407
4408         // varyings
4409         float ModelViewProjectionPositiondata[4];
4410         float ModelViewProjectionPositionslope[4];
4411
4412         // uniforms
4413         float ScreenScaleRefractReflect[2];
4414         float ScreenCenterRefractReflect[2];
4415         float DistortScaleRefractReflect[2];
4416         float RefractColor[4];
4417
4418         const unsigned char * RESTRICT pixelbase;
4419         const unsigned char * RESTRICT pixel[4];
4420         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4421         if(!texture) return;
4422         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4423
4424         // read textures
4425         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4426         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4427
4428         // read varyings
4429         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4430
4431         // read uniforms
4432         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4433         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4434         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4435         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4436         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4437         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4438         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4439         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4440         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4441         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4442
4443         // do stuff
4444         for (x = startx;x < endx;x++)
4445         {
4446                 float SafeScreenTexCoord[2];
4447                 float ScreenTexCoord[2];
4448                 float v[3];
4449                 float iw;
4450                 unsigned char c[4];
4451
4452                 z = buffer_z[x];
4453
4454                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4455                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4456         
4457                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4458                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4459                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4460
4461                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4462                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4463                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4464                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4465                 DPSOFTRAST_Vector3Normalize(v);
4466                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4467                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4468
4469                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4470                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4471                 {
4472                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4473                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4474                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4475                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4476                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4477                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4478                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4479                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4480                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4481                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4482                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4483                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4484                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4485                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4486                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4487                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4488                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4489                 }
4490                 else
4491                 {
4492                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4493                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4494                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4495                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4496                         c[0] = pixel[0][0];
4497                         c[1] = pixel[0][1];
4498                         c[2] = pixel[0][2];
4499                 }
4500
4501                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4502                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4503                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4504                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4505                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4506         }
4507
4508         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4509 }
4510
4511
4512
4513 void DPSOFTRAST_VertexShader_Water(void)
4514 {
4515         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4516 }
4517
4518
4519 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4520 {
4521         // TODO: IMPLEMENT
4522         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4523         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4524         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4525         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4526         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4527 }
4528
4529
4530
4531 void DPSOFTRAST_VertexShader_ShowDepth(void)
4532 {
4533         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4534 }
4535
4536 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4537 {
4538         // TODO: IMPLEMENT
4539         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4540         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4541         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4542         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4543         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4544 }
4545
4546
4547
4548 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4549 {
4550         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4551 }
4552
4553 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4554 {
4555         // TODO: IMPLEMENT
4556         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4557         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4558         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4559         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4560         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4561 }
4562
4563
4564
4565 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4566 {
4567         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4568 }
4569
4570 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4571 {
4572         // TODO: IMPLEMENT
4573         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4574         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4575         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4576         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4577         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4578 }
4579
4580
4581
4582 typedef struct DPSOFTRAST_ShaderModeInfo_s
4583 {
4584         int lodarrayindex;
4585         void (*Vertex)(void);
4586         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4587         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4588         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4589 }
4590 DPSOFTRAST_ShaderModeInfo;
4591
4592 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4593 {
4594         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4595         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4596         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4597         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4598         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4599         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4600         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4601         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4602         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4603         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4604         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4605         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4606         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4607         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4608         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4609         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4610 };
4611
4612 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4613 {
4614         int i;
4615         int x;
4616         int startx;
4617         int endx;
4618 //      unsigned int c;
4619 //      unsigned int *colorpixel;
4620         unsigned int *depthpixel;
4621         float w;
4622         float wslope;
4623         int depth;
4624         int depthslope;
4625         unsigned int d;
4626         DPSOFTRAST_State_Triangle *triangle;
4627         DPSOFTRAST_State_Span *span;
4628         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4629         for (i = 0; i < thread->numspans; i++)
4630         {
4631                 span = &thread->spans[i];
4632                 triangle = &thread->triangles[span->triangle];
4633                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4634                 {
4635                         wslope = triangle->w[0];
4636                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4637                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4638                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4639                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4640                         startx = span->startx;
4641                         endx = span->endx;
4642                         switch(thread->fb_depthfunc)
4643                         {
4644                         default:
4645                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4646                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4647                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4648                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4649                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4650                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4651                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4652                         }
4653                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4654                         //for (x = startx;x < endx;x++)
4655                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4656                         // if there is no color buffer, skip pixel shader
4657                         while (startx < endx && !pixelmask[startx])
4658                                 startx++;
4659                         while (endx > startx && !pixelmask[endx-1])
4660                                 endx--;
4661                         if (startx >= endx)
4662                                 continue; // no pixels to fill
4663                         span->pixelmask = pixelmask;
4664                         span->startx = startx;
4665                         span->endx = endx;
4666                         // run pixel shader if appropriate
4667                         // do this before running depthmask code, to allow the pixelshader
4668                         // to clear pixelmask values for alpha testing
4669                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4670                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4671                         if (thread->depthmask)
4672                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4673                                         if (pixelmask[x])
4674                                                 depthpixel[x] = d;
4675                 }
4676                 else
4677                 {
4678                         // no depth testing means we're just dealing with color...
4679                         // if there is no color buffer, skip pixel shader
4680                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4681                         {
4682                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4683                                 span->pixelmask = pixelmask;
4684                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4685                         }
4686                 }
4687         }
4688         thread->numspans = 0;
4689 }
4690
// Draw command (opcode 22). The field list below is the payload that
// DPSOFTRAST_Interpret_Draw reads. DEFCOMMAND itself is defined outside this
// view; presumably it also supplies the common header (the interpreter reads
// command->commandsize, which is not in this field list) - TODO confirm.
4691 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4692
// Executes one Draw command against the calling thread's state.
//
// - Returns early, after dropping this thread's reference on the command, when
//   the command's row range [starty, endy) overlaps neither of the thread's two
//   scissor-clamped row bands (miny1..maxy1 and miny2..maxy2).
// - Otherwise walks every triangle: gathers its three vertex indices,
//   near-plane clips it when the command is flagged clipped, culls it according
//   to thread->cullface, sets up the plane equations for w and for each shader
//   attribute, picks a mip level per texture unit, and emits spans into
//   thread->spans. Spans/triangles are flushed through
//   DPSOFTRAST_Draw_ProcessSpans whenever a batch fills up and once at the end.
// - The thread that drops the last reference frees command->arrays when the
//   command is no larger than the bare header (see
//   DPSOFTRAST_Draw_AllocateDrawCommand, which allocates the data separately
//   in that case).
//
// The entire body is compiled only when SSE_POSSIBLE is defined; without it the
// function does nothing.
4693 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4694 {
4695 #ifdef SSE_POSSIBLE
4696         int cullface = thread->cullface;
4697         int minx, maxx, miny, maxy;
4698         int miny1, maxy1, miny2, maxy2;
4699         __m128i fbmin, fbmax;
4700         __m128 viewportcenter, viewportscale;
4701         int firstvertex = command->firstvertex;
4702         int numvertices = command->numvertices;
4703         int numtriangles = command->numtriangles;
4704         const int *element3i = command->element3i;
4705         const unsigned short *element3s = command->element3s;
4706         int clipped = command->clipped;
4707         int i;
4708         int j;
4709         int k;
4710         int y;
4711         int e[3];
4712         __m128i screeny;
4713         int starty, endy, bandy;
4714         int numpoints;
4715         int clipcase;
4716         float clipdist[4];
4717         __m128 triangleedge1, triangleedge2, trianglenormal;
4718         __m128 clipfrac[3];
4719         __m128 screen[4];
4720         DPSOFTRAST_State_Triangle *triangle;
4721         DPSOFTRAST_Texture *texture;
4722         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // clamp both of this thread's row bands to the scissor rectangle's rows
4723         miny = thread->fb_scissor[1];
4724         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4725         miny1 = bound(miny, thread->miny1, maxy);
4726         maxy1 = bound(miny, thread->maxy1, maxy);
4727         miny2 = bound(miny, thread->miny2, maxy);
4728         maxy2 = bound(miny, thread->maxy2, maxy);
             // the command's rows miss both bands: nothing to rasterize on this thread,
             // but its reference on the command must still be released
4729         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4730         {
4731                 if (!ATOMIC_DECREMENT(command->refcount))
4732                 {
                             // header-only command: the vertex data was allocated separately
4733                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4734                                 MM_FREE(command->arrays);
4735                 }
4736                 return;
4737         }
4738         minx = thread->fb_scissor[0];
4739         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // (x, y) bounds replicated as 16-bit pairs; fbmax is made inclusive by the -1
4740         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4741         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4742         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4743         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4744         screen[3] = _mm_setzero_ps();
4745         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4746         for (i = 0;i < numtriangles;i++)
4747         {
                     // command->arrays begins with numvertices screen-space vec4s, followed by
                     // numvertices vec4s whose z and w are used for the near-plane test below
4748                 const float *screencoord4f = command->arrays;
4749                 const float *arrays = screencoord4f + numvertices*4;
4750
4751                 // generate the 3 edges of this triangle
4752                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // indices are rebased by firstvertex; with no index list the triangles
                     // are consecutive vertex triples
4753                 if (element3s)
4754                 {
4755                         e[0] = element3s[i*3+0] - firstvertex;
4756                         e[1] = element3s[i*3+1] - firstvertex;
4757                         e[2] = element3s[i*3+2] - firstvertex;
4758                 }
4759                 else if (element3i)
4760                 {
4761                         e[0] = element3i[i*3+0] - firstvertex;
4762                         e[1] = element3i[i*3+1] - firstvertex;
4763                         e[2] = element3i[i*3+2] - firstvertex;
4764                 }
4765                 else
4766                 {
4767                         e[0] = i*3+0;
4768                         e[1] = i*3+1;
4769                         e[2] = i*3+2;
4770                 }
4771
                     // SKIPBACKFACE: builds the two screen-space edges leaving screen[1], takes
                     // their scalar cross product (lane 0 of trianglenormal) and skips the
                     // triangle (continue) when its sign is rejected by cullface
4772 #define SKIPBACKFACE \
4773                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4774                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4775                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4776                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4777                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4778                 switch(cullface) \
4779                 { \
4780                 case GL_BACK: \
4781                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4782                                 continue; \
4783                         break; \
4784                 case GL_FRONT: \
4785                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4786                                 continue; \
4787                         break; \
4788                 }
4789
                     // CLIPPEDVERTEXLERP(k,p1,p2): stores in clipfrac[p1] the fraction along
                     // p1->p2 at which clipdist reaches zero, interpolates the vertex there and
                     // passes it through DPSOFTRAST_PROJECTVERTEX into screen[k]
4790 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4791                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4792                         { \
4793                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4794                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4795                         }
                     // CLIPPEDVERTEXCOPY(k,p1): reuses the stored screen coordinate of vertex p1
4796 #define CLIPPEDVERTEXCOPY(k,p1) \
4797                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4798
                     // GENATTRIBS: rebuilds the three attribute values that correspond to
                     // screen[0..2] for the current clipcase, reusing the clipfrac values
                     // written by CLIPPEDVERTEXLERP
4799 #define GENATTRIBCOPY(attrib, p1) \
4800                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4801 #define GENATTRIBLERP(attrib, p1, p2) \
4802                 { \
4803                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4804                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4805                 }
4806 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4807                 switch(clipcase) \
4808                 { \
4809                 default: \
4810                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4811                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4812                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4813                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4814                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4815                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4816                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4817                 }
4818
                     // command not flagged as clipped: skip the near-plane test and jump
                     // straight into the "all three vertices in front" case
4819                 if (! clipped)
4820                         goto notclipped;
4821
4822                 // calculate distance from nearplane
4823                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4824                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4825                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // each branch below builds the clipped polygon (3 or 4 points) in screen[]
                     // and records which case was taken so GENATTRIBS can mirror it
4826                 if (clipdist[0] >= 0.0f)
4827                 {
4828                         if (clipdist[1] >= 0.0f)
4829                         {
4830                                 if (clipdist[2] >= 0.0f)
4831                                 {
4832                                 notclipped:
4833                                         // triangle is entirely in front of nearplane
4834                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4835                                         SKIPBACKFACE;
4836                                         numpoints = 3;
4837                                         clipcase = 0;
4838                                 }
4839                                 else
4840                                 {
4841                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4842                                         SKIPBACKFACE;
4843                                         numpoints = 4;
4844                                         clipcase = 1;
4845                                 }
4846                         }
4847                         else
4848                         {
4849                                 if (clipdist[2] >= 0.0f)
4850                                 {
4851                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4852                                         SKIPBACKFACE;
4853                                         numpoints = 4;
4854                                         clipcase = 2;
4855                                 }
4856                                 else
4857                                 {
4858                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4859                                         SKIPBACKFACE;
4860                                         numpoints = 3;
4861                                         clipcase = 3;
4862                                 }
4863                         }
4864                 }
4865                 else if (clipdist[1] >= 0.0f)
4866                 {
4867                         if (clipdist[2] >= 0.0f)
4868                         {
4869                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4870                                 SKIPBACKFACE;
4871                                 numpoints = 4;
4872                                 clipcase = 4;
4873                         }
4874                         else
4875                         {
4876                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4877                                 SKIPBACKFACE;
4878                                 numpoints = 3;
4879                                 clipcase = 5;
4880                         }
4881                 }
4882                 else if (clipdist[2] >= 0.0f)
4883                 {
4884                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4885                         SKIPBACKFACE;
4886                         numpoints = 3;
4887                         clipcase = 6;
4888                 }
4889                 else continue; // triangle is entirely behind nearplane
4890
4891                 {
4892                         // calculate integer y coords for triangle points
                             // screeni packs the truncated x,y of the points as 16-bit values; for a
                             // 3-point polygon point 2 is duplicated into the fourth slot
4893                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4894                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4895                                         screenmin = _mm_min_epi16(screeni, screenir),
4896                                         screenmax = _mm_max_epi16(screeni, screenir);
                             // finish the min/max reduction over all points, then clamp to fbmin/fbmax
4897                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4898                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4899                         screenmin = _mm_max_epi16(screenmin, fbmin);
4900                         screenmax = _mm_min_epi16(screenmax, fbmax);
4901                         // skip offscreen triangles
4902                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4903                                 continue;
                             // row range covered by the polygon; endy is exclusive
4904                         starty = _mm_extract_epi16(screenmin, 1);
4905                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // polygon lies entirely in the gap between the two row bands
4906                         if (starty >= maxy1 && endy <= miny2)
4907                                 continue;
                             // keep only the y halves, sign-extended to one 32-bit lane per point
4908                         screeny = _mm_srai_epi32(screeni, 16);
4909                 }
4910
4911                 triangle = &thread->triangles[thread->numtriangles];
4912
4913                 // calculate attribute plans for triangle data...
4914                 // okay, this triangle is going to produce spans, we'd better project
4915                 // the interpolants now (this is what gives perspective texturing),
4916                 // this consists of simply multiplying all arrays by the W coord
4917                 // (which is basically 1/Z), which will be undone per-pixel
4918                 // (multiplying by Z again) to get the perspective-correct array
4919                 // values
4920                 {
4921                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4922                         __m128 mipedgescale, mipdensity;
                             // edge components divided by the cross product give the factors that
                             // turn per-edge deltas into per-pixel x and y slopes
4923                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4924                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4925                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4926                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4927                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4928                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4929                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4930                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4931                         attribedge1 = _mm_sub_ss(w0, w1);
4932                         attribedge2 = _mm_sub_ss(w2, w1);
4933                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4934                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4935                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4936                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4937                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
                             // triangle->w = { x slope, y slope, origin value } of the screen w component
4938                         _mm_store_ss(&triangle->w[0], attribxslope);
4939                         _mm_store_ss(&triangle->w[1], attribyslope);
4940                         _mm_store_ss(&triangle->w[2], attriborigin);
4941                         mipedgescale = _mm_setzero_ps();
                             // same plane setup for every attribute array the shader mode lists; the
                             // list ends at the first index >= DPSOFTRAST_ARRAY_TOTAL
4942                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4943                         {
4944                                 __m128 attrib0, attrib1, attrib2;
4945                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4946                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4947                                         break;
                                     // step to the next per-vertex vec4 array in the data block
4948                                 arrays += numvertices*4;
4949                                 GENATTRIBS(attrib0, attrib1, attrib2);
4950                                 attriborigin = _mm_mul_ps(attrib1, w1);
4951                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4952                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4953                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4954                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4955                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4956                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4957                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4958                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array used for LOD: attribute change along each edge times
                                     // the reciprocal screen length of that edge, consumed by the mip
                                     // selection below
4959                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4960                                 {
4961                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4962                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4963                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4964                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4965                                 }
4966                         }
4967
                             // mip level defaults to 0 for every texture unit
4968                         memset(triangle->mip, 0, sizeof(triangle->mip));
4969                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4970                         {
4971                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4972                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4973                                         break;
4974                                 texture = thread->texbound[texunit];
                                     // only filters ranked above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a
                                     // non-zero mip (presumably the mipmapped filter modes - confirm)
4975                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4976                                 {
4977                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4978                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4979                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4980                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4981                                         // this will be multiplied in the texturing routine by the texture resolution
4982                                         y = _mm_cvtss_si32(mipdensity);
4983                                         if (y > 0)
4984                                         {
                                                     // half of log2(y), clamped to the texture's last mip level
4985                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4986                                                 if (y > texture->mipmaps - 1)
4987                                                         y = texture->mipmaps - 1;
4988                                                 triangle->mip[texunit] = y;
4989                                         }
4990                                 }
4991                         }
4992                 }
4993         
                     // outer loop: first pass covers rows up to min(endy, maxy1); later passes
                     // resume at max(y, miny2) and run up to min(endy, maxy2).
                     // inner loop: each iteration picks the pair of polygon edges bounding row
                     // y and emits spans for a run of rows sharing that pair
4994                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4995                 for (; y < bandy;)
4996                 {
4997                         __m128 xcoords, xslope;
                             // ycc lane is all ones for each point whose integer y is less than the
                             // current row; the resulting mask indexes the edge tables below
4998                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4999                         int yccmask = _mm_movemask_epi8(ycc);
5000                         int edge0p, edge0n, edge1p, edge1n;
5001                         int nexty;
5002                         if (numpoints == 4)
5003                         {
5004                                 switch(yccmask)
5005                                 {
5006                                 default:
5007                                 case 0xFFFF: /*0000*/ y = endy; continue;
5008                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5009                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5010                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5011                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5012                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5013                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5014                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5015                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5016                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5017                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5018                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5019                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5020                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5021                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5022                                 case 0x0000: /*1111*/ y++; continue;
5023                                 }
5024                         }
5025                         else
5026                         {
5027                                 switch(yccmask)
5028                                 {
5029                                 default:
5030                                 case 0xFFFF: /*000*/ y = endy; continue;
5031                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5032                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5033                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5034                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5035                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5036                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5037                                 case 0x0000: /*111*/ y++; continue;
5038                                 }
5039                         }
                             // nexty: appears to be the smallest integer y among the points not yet
                             // passed, i.e. the last row that can use this edge pair; it is then
                             // limited to the current band
5040                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5041                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5042                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5043                         nexty = _mm_extract_epi16(ycc, 0);
5044                         if (nexty >= bandy) nexty = bandy-1;
                             // per-row x step of both edges (dx/dy), then x of both edges at row y
5045                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5046                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5047                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5048                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
                             // bias by 0.5 ahead of the float->int conversions below
5049                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // order the edges so lane 0 holds the smaller x (left edge)
5050                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5051                         {
5052                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5053                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5054                         }
5055                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5056                         {
5057                                 int startx, endx, offset;
5058                                 startx = _mm_cvtss_si32(xcoords);
5059                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // left of the scissor: skip whole span-length chunks that lie before
                                     // minx (the mask arithmetic presumes DPSOFTRAST_DRAW_MAXSPANLENGTH is
                                     // a power of two)
5060                                 if (startx < minx) 
5061                                 {
5062                                         if (startx < 0) startx = 0;
5063                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5064                                 }
5065                                 if (endx > maxx) endx = maxx;
5066                                 if (startx >= endx) continue;
                                     // cut the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH
                                     // pixels; span->x is the chunk's first column and startx/endx are
                                     // offsets relative to it
5067                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5068                                 {
5069                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5070                                         span->triangle = thread->numtriangles;
5071                                         span->x = offset;
5072                                         span->y = y;
5073                                         span->startx = max(minx - offset, 0);
5074                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5075                                         if (span->startx >= span->endx)
5076                                                 continue; 
                                             // span buffer full: process what has been queued so far
5077                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5078                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5079                                 }
5080                         }
5081                 }
5082
                     // triangle buffer full: flush pending spans and restart the triangle batch
5083                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5084                 {
5085                         DPSOFTRAST_Draw_ProcessSpans(thread);
5086                         thread->numtriangles = 0;
5087                 }
5088         }
5089
             // this thread no longer reads the command's vertex data; drop its reference
             // and free separately allocated data if it was the last one
5090         if (!ATOMIC_DECREMENT(command->refcount))
5091         {
5092                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5093                         MM_FREE(command->arrays);
5094         }
5095
             // flush whatever is still queued for this draw
5096         if (thread->numspans > 0 || thread->numtriangles > 0)
5097         {
5098                 DPSOFTRAST_Draw_ProcessSpans(thread);
5099                 thread->numtriangles = 0;
5100         }
5101 #endif
5102 }
5103
// Allocates a Draw command and lays out its vertex data block.
//
// Data block layout, in order (each array is numvertices vec4 floats):
//   1. screen coordinates          -> dpsoftrast.screencoord4f
//   2. post-transform position     -> dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION]
//   3. one array per attribute listed by the current shader mode
//                                  -> dpsoftrast.post_array4f[j]
//   4. a copy of the index list (element3s if given, else element3i if given)
//
// When header + data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data block
// is allocated with MM_CALLOC and only the header is placed in the command
// queue; otherwise the data follows the header inside the command itself.
//
// Fills firstvertex, numvertices, numtriangles, arrays, element3i and
// element3s of the command and returns it; the remaining fields are left for
// the caller to set.
// NOTE(review): the MM_CALLOC result is used without a NULL check.
5104 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5105 {
5106         int i;
5107         int j;
5108         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
             // screen coordinates + position array are always present
5109         int datasize = 2*numvertices*sizeof(float[4]);
5110         DPSOFTRAST_Command_Draw *command;
5111         unsigned char *data;
             // one more vec4 array per attribute the shader mode lists; the list ends at
             // the first index >= DPSOFTRAST_ARRAY_TOTAL
5112         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5113         {
5114                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5115                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5116                         break;
5117                 datasize += numvertices*sizeof(float[4]);
5118         }
5119         if (element3s)
5120                 datasize += numtriangles*sizeof(unsigned short[3]);
5121         else if (element3i)
5122                 datasize += numtriangles*sizeof(int[3]);
5123         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5124         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5125         {
                     // too large for the queue: header-only command, data allocated separately
                     // (freed later by whoever sees commandsize <= aligned header size)
5126                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5127                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5128         }
5129         else
5130         {
                     // data lives inline, directly after the aligned header
5131                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5132                 data = (unsigned char *)command + commandsize;
5133         }
5134         command->firstvertex = firstvertex;
5135         command->numvertices = numvertices;
5136         command->numtriangles = numtriangles;
5137         command->arrays = (float *)data;
             // reset the global output pointers, then point them into the new data block
5138         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5139         dpsoftrast.firstvertex = firstvertex;
5140         dpsoftrast.numvertices = numvertices;
5141         dpsoftrast.screencoord4f = (float *)data;
5142         data += numvertices*sizeof(float[4]);
5143         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5144         data += numvertices*sizeof(float[4]);
5145         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5146         {
5147                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5148                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5149                         break;
5150                 dpsoftrast.post_array4f[j] = (float *)data;
5151                 data += numvertices*sizeof(float[4]);
5152         }
             // the index list is copied behind the vertex arrays; at most one of the two
             // pointers ends up non-NULL
5153         command->element3i = NULL;
5154         command->element3s = NULL;
5155         if (element3s)
5156         {
5157                 command->element3s = (unsigned short *)data;
5158                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5159         }
5160         else if (element3i)
5161         {
5162                 command->element3i = (int *)data;
5163                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5164         }
5165         return command;
5166 }
5167
// Queues a triangle draw.
//
// firstvertex/numvertices: vertex range of the draw.
// numtriangles:            number of triangles.
// element3i/element3s:     optional index lists (3 per triangle); element3s is
//                          preferred when both are given, and both may be NULL.
//
// Allocates the Draw command, runs the current shader mode's Vertex() callback
// (presumably this fills the output arrays and sets dpsoftrast.drawstarty,
// drawendy and drawclipped - confirm against the shader implementations), then
// clamps the row range to the framebuffer height. An empty row range cancels
// the command. Otherwise the command gets one reference per thread and is
// either handed to the worker threads or executed via
// DPSOFTRAST_Draw_FlushThreads when threading is off.
5168 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5169 {
5170         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5171         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5172         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5173         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
             // no rows to draw: release separately allocated vertex data and take the
             // command back out of the queue
5174         if (command->starty >= command->endy)
5175         {
5176                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5177                         MM_FREE(command->arrays);
5178                 DPSOFTRAST_UndoCommand(command->commandsize);
5179                 return;
5180         }
5181         command->clipped = dpsoftrast.drawclipped;
             // every thread decrements this once in DPSOFTRAST_Interpret_Draw
5182         command->refcount = dpsoftrast.numthreads;
5183
5184         if (dpsoftrast.usethreads)
5185         {
5186                 int i;
5187                 DPSOFTRAST_Draw_SyncCommands();
                     // wake only starving threads whose row bands overlap this draw
5188                 for (i = 0; i < dpsoftrast.numthreads; i++)
5189                 {
5190                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5191                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5192                                 Thread_CondSignal(thread->drawcond);
5193                 }
5194         }
5195         else
5196         {
5197                 DPSOFTRAST_Draw_FlushThreads();
5198         }
5199 }
5200
5201 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5202 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5203 {
5204         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5205 }
5206 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5207 {
5208         DPSOFTRAST_Command_SetRenderTargets *command;
5209         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5210                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5211                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5212                 DPSOFTRAST_Flush();
5213         dpsoftrast.fb_width = width;
5214         dpsoftrast.fb_height = height;
5215         dpsoftrast.fb_depthpixels = depthpixels;
5216         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5217         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5218         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5219         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5220         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5221         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5222         command->width = width;
5223         command->height = height;
5224 }
5225  
5226 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5227 {
5228         int commandoffset = thread->commandoffset;
5229         while (commandoffset != endoffset)
5230         {
5231                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5232                 switch (command->opcode)
5233                 {
5234 #define INTERPCOMMAND(name) \
5235                 case DPSOFTRAST_OPCODE_##name : \
5236                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5237                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5238                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5239                                 commandoffset = 0; \
5240                         break;
5241                 INTERPCOMMAND(Viewport)
5242                 INTERPCOMMAND(ClearColor)
5243                 INTERPCOMMAND(ClearDepth)
5244                 INTERPCOMMAND(ColorMask)
5245                 INTERPCOMMAND(DepthTest)
5246                 INTERPCOMMAND(ScissorTest)
5247                 INTERPCOMMAND(Scissor)
5248                 INTERPCOMMAND(BlendFunc)
5249                 INTERPCOMMAND(BlendSubtract)
5250                 INTERPCOMMAND(DepthMask)
5251                 INTERPCOMMAND(DepthFunc)
5252                 INTERPCOMMAND(DepthRange)
5253                 INTERPCOMMAND(PolygonOffset)
5254                 INTERPCOMMAND(CullFace)
5255                 INTERPCOMMAND(AlphaTest)
5256                 INTERPCOMMAND(AlphaFunc)
5257                 INTERPCOMMAND(SetTexture)
5258                 INTERPCOMMAND(SetShader)
5259                 INTERPCOMMAND(Uniform4f)
5260                 INTERPCOMMAND(UniformMatrix4f)
5261                 INTERPCOMMAND(Uniform1i)
5262                 INTERPCOMMAND(SetRenderTargets)
5263
5264                 case DPSOFTRAST_OPCODE_Draw:
5265                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5266                         commandoffset += command->commandsize;
5267                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5268                                 commandoffset = 0;
5269                         thread->commandoffset = commandoffset;
5270                         break;
5271
5272                 case DPSOFTRAST_OPCODE_Reset:
5273                         commandoffset = 0;
5274                         break;
5275                 }
5276         }
5277         thread->commandoffset = commandoffset;
5278 }
5279
5280 static int DPSOFTRAST_Draw_Thread(void *data)
5281 {
5282         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5283         while(thread->index >= 0)
5284         {
5285                 if (thread->commandoffset != dpsoftrast.drawcommand)
5286                 {
5287                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5288                 }
5289                 else 
5290                 {
5291                         Thread_LockMutex(thread->drawmutex);
5292                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5293                         {
5294                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5295                                 thread->starving = true;
5296                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5297                                 thread->starving = false;
5298                         }
5299                         Thread_UnlockMutex(thread->drawmutex);
5300                 }
5301         }   
5302         return 0;
5303 }
5304
5305 static void DPSOFTRAST_Draw_FlushThreads(void)
5306 {
5307         DPSOFTRAST_State_Thread *thread;
5308         int i;
5309         DPSOFTRAST_Draw_SyncCommands();
5310         if (dpsoftrast.usethreads) 
5311         {
5312                 for (i = 0; i < dpsoftrast.numthreads; i++)
5313                 {
5314                         thread = &dpsoftrast.threads[i];
5315                         if (thread->commandoffset != dpsoftrast.drawcommand)
5316                         {
5317                                 Thread_LockMutex(thread->drawmutex);
5318                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5319                                         Thread_CondSignal(thread->drawcond);
5320                                 Thread_UnlockMutex(thread->drawmutex);
5321                         }
5322                 }
5323                 for (i = 0; i < dpsoftrast.numthreads; i++)
5324                 {
5325                         thread = &dpsoftrast.threads[i];
5326                         if (thread->commandoffset != dpsoftrast.drawcommand)
5327                         {
5328                                 Thread_LockMutex(thread->drawmutex);
5329                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5330                                 {
5331                                         thread->waiting = true;
5332                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5333                                         thread->waiting = false;
5334                                 }
5335                                 Thread_UnlockMutex(thread->drawmutex);
5336                         }
5337                 }
5338         }
5339         else
5340         {
5341                 for (i = 0; i < dpsoftrast.numthreads; i++)
5342                 {
5343                         thread = &dpsoftrast.threads[i];
5344                         if (thread->commandoffset != dpsoftrast.drawcommand)
5345                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5346                 }
5347         }
5348         dpsoftrast.commandpool.usedcommands = 0;
5349 }
5350
// Public flush: processes every queued command by delegating to
// DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
	return;
}
5355
// Public finish: currently identical to DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
	return;
}
5360
5361 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5362 {
5363         int i;
5364         union
5365         {
5366                 int i;
5367                 unsigned char b[4];
5368         }
5369         u;
5370         u.i = 1;
5371         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5372         dpsoftrast.bigendian = u.b[3];
5373         dpsoftrast.fb_width = width;
5374         dpsoftrast.fb_height = height;
5375         dpsoftrast.fb_depthpixels = depthpixels;
5376         dpsoftrast.fb_colorpixels[0] = colorpixels;
5377         dpsoftrast.fb_colorpixels[1] = NULL;
5378         dpsoftrast.fb_colorpixels[1] = NULL;
5379         dpsoftrast.fb_colorpixels[1] = NULL;
5380         dpsoftrast.viewport[0] = 0;
5381         dpsoftrast.viewport[1] = 0;
5382         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5383         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5384         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5385         dpsoftrast.texture_firstfree = 1;
5386         dpsoftrast.texture_end = 1;
5387         dpsoftrast.texture_max = 0;
5388         dpsoftrast.color[0] = 1;
5389         dpsoftrast.color[1] = 1;
5390         dpsoftrast.color[2] = 1;
5391         dpsoftrast.color[3] = 1;
5392         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5393         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5394         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5395         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5396         for (i = 0; i < dpsoftrast.numthreads; i++)
5397         {
5398                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5399                 thread->index = i;
5400                 thread->cullface = GL_BACK;
5401                 thread->colormask[1] = 1;
5402                 thread->colormask[2] = 1;
5403                 thread->colormask[3] = 1;
5404                 thread->blendfunc[0] = GL_ONE;
5405                 thread->blendfunc[1] = GL_ZERO;
5406                 thread->depthmask = true;
5407                 thread->depthtest = true;
5408                 thread->depthfunc = GL_LEQUAL;
5409                 thread->scissortest = false;
5410                 thread->alphatest = false;
5411                 thread->alphafunc = GL_GREATER;
5412                 thread->alphavalue = 0.5f;
5413                 thread->viewport[0] = 0;
5414                 thread->viewport[1] = 0;
5415                 thread->viewport[2] = dpsoftrast.fb_width;
5416                 thread->viewport[3] = dpsoftrast.fb_height;
5417                 thread->scissor[0] = 0;
5418                 thread->scissor[1] = 0;
5419                 thread->scissor[2] = dpsoftrast.fb_width;
5420                 thread->scissor[3] = dpsoftrast.fb_height;
5421                 thread->depthrange[0] = 0;
5422                 thread->depthrange[1] = 1;
5423                 thread->polygonoffset[0] = 0;
5424                 thread->polygonoffset[1] = 0;
5425         
5426                 DPSOFTRAST_RecalcThread(thread);
5427         
5428                 thread->numspans = 0;
5429                 thread->numtriangles = 0;
5430                 thread->commandoffset = 0;
5431                 thread->waiting = false;
5432                 thread->starving = false;
5433            
5434                 thread->validate = -1;
5435                 DPSOFTRAST_Validate(thread, -1);
5436  
5437                 if (dpsoftrast.usethreads)
5438                 {
5439                         thread->waitcond = Thread_CreateCond();
5440                         thread->drawcond = Thread_CreateCond();
5441                         thread->drawmutex = Thread_CreateMutex();
5442                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5443                 }
5444         }
5445         return 0;
5446 }
5447
5448 void DPSOFTRAST_Shutdown(void)
5449 {
5450         int i;
5451         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5452         {
5453                 DPSOFTRAST_State_Thread *thread;
5454                 for (i = 0; i < dpsoftrast.numthreads; i++)
5455                 {
5456                         thread = &dpsoftrast.threads[i];
5457                         Thread_LockMutex(thread->drawmutex);
5458                         thread->index = -1;
5459                         Thread_CondSignal(thread->drawcond);
5460                         Thread_UnlockMutex(thread->drawmutex);
5461                         Thread_WaitThread(thread->thread, 0);
5462                         Thread_DestroyCond(thread->waitcond);
5463                         Thread_DestroyCond(thread->drawcond);
5464                         Thread_DestroyMutex(thread->drawmutex);
5465                 }
5466         }
5467         for (i = 0;i < dpsoftrast.texture_end;i++)
5468                 if (dpsoftrast.texture[i].bytes)
5469                         MM_FREE(dpsoftrast.texture[i].bytes);
5470         if (dpsoftrast.texture)
5471                 free(dpsoftrast.texture);
5472         if (dpsoftrast.threads)
5473                 MM_FREE(dpsoftrast.threads);
5474         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5475 }
5476