]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
slight optimization to 2D rendering - if triangle wslope is 0, don't
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// byte alignment used for SSE data (ALIGN) and for atomically accessed data (ATOMIC)
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// Compiler specific alignment, memory barrier and atomic counter primitives.
// These are only chosen when SSE_POSSIBLE is defined; whatever is still
// undefined afterwards receives a plain non-atomic fallback further down.
#if defined(SSE_POSSIBLE)
#  if defined(__APPLE__)
#    include <libkern/OSAtomic.h>
#    define ALIGN(var) var __attribute__((__aligned__(16)))
#    define ATOMIC(var) var __attribute__((__aligned__(32)))
#    define MEMORY_BARRIER (_mm_sfence())
#    define ATOMIC_COUNTER volatile int32_t
#    define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
#    define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
#    define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
#  elif defined(__GNUC__)
#    define ALIGN(var) var __attribute__((__aligned__(16)))
#    define ATOMIC(var) var __attribute__((__aligned__(32)))
#    define MEMORY_BARRIER (_mm_sfence())
     //(__sync_synchronize())
#    define ATOMIC_COUNTER volatile int
#    define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
#    define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
#    define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
#  elif defined(_MSC_VER)
#    define ALIGN(var) __declspec(align(16)) var
#    define ATOMIC(var) __declspec(align(32)) var
#    define MEMORY_BARRIER (_mm_sfence())
     //(MemoryBarrier())
#    define ATOMIC_COUNTER volatile LONG
#    define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
#    define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
#    define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
#  endif
#endif

// fallbacks: no alignment attribute, no barrier, ordinary integer arithmetic
#if !defined(ALIGN)
#  define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#  define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#  define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#  define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#  define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#  define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#  define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
73 #ifdef SSE_POSSIBLE
74 #include <emmintrin.h>
75
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
77
78 static void *MM_CALLOC(size_t nmemb, size_t size)
79 {
80         void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81         if (ptr != NULL) memset(ptr, 0, nmemb*size);
82         return ptr;
83 }
84
85 #define MM_FREE _mm_free
86 #else
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
89 #define MM_FREE free
90 #endif
91
// Indices of the per-vertex attribute arrays; DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size attribute tables.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
107
108 typedef struct DPSOFTRAST_Texture_s
109 {
110         int flags;
111         int width;
112         int height;
113         int depth;
114         int sides;
115         DPSOFTRAST_TEXTURE_FILTER filter;
116         int mipmaps;
117         int size;
118         ATOMIC_COUNTER binds;
119         unsigned char *bytes;
120         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
121 }
122 DPSOFTRAST_Texture;
123
// commands share the alignment used for SSE data
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Header shared by every command: DEFCOMMAND structs begin with these same
// two members, followed by the command specific fields.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;
	unsigned short commandsize;
} DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// Declares the opcode constant DPSOFTRAST_OPCODE_<name> and the aligned
// struct type DPSOFTRAST_Command_<name> carrying the header plus fields.
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
144
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
147
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
149 {
150         int freecommand;
151         int usedcommands;
152         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
153 }
154 DPSOFTRAST_State_Command_Pool);
155
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
157 {
158         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
159         float w[3];
160         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
161 }
162 DPSOFTRAST_State_Triangle);
163
// Evaluates attribute array arrayindex of a triangle at the position of a
// span (SSE version): slope receives attribs[arrayindex][0] and data receives
// attribs[arrayindex][2] + span->x * attribs[arrayindex][0] + span->y * attribs[arrayindex][1].
// triangle and span are pointers; data and slope must be assignable __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4].
// The span argument is fully parenthesized so that pointer expressions such
// as "spans + i" expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	slope[0] = (triangle)->attribs[arrayindex][0][0]; \
	slope[1] = (triangle)->attribs[arrayindex][0][1]; \
	slope[2] = (triangle)->attribs[arrayindex][0][2]; \
	slope[3] = (triangle)->attribs[arrayindex][0][3]; \
	data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
182
// A horizontal run of pixels produced for one triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer coordinates
	int x, y;
	// usable range (according to pixelmask)
	int startx, endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
} DPSOFTRAST_State_Span);
193
// sizes of the per-thread span and triangle arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of the per-thread validate field; DPSOFTRAST_Validate recomputes the
// matching derived values and clears the bit
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
// everything that has to be valid before drawing
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
201
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc picks
// one from the current blend factors and falls back to OPAQUE for
// combinations it does not list.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
217
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
219 {
220         void *thread;
221         int index;
222         
223         int cullface;
224         int colormask[4];
225         int blendfunc[2];
226         int blendsubtract;
227         int depthmask;
228         int depthtest;
229         int depthfunc;
230         int scissortest;
231         int alphatest;
232         int alphafunc;
233         float alphavalue;
234         int viewport[4];
235         int scissor[4];
236         float depthrange[2];
237         float polygonoffset[2];
238
239         int shader_mode;
240         int shader_permutation;
241         int shader_exactspecularmath;
242
243         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
244         
245         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
247
248         // DPSOFTRAST_VALIDATE_ flags
249         int validate;
250
251         // derived values (DPSOFTRAST_VALIDATE_FB)
252         int fb_colormask;
253         int fb_scissor[4];
254         ALIGN(float fb_viewportcenter[4]);
255         ALIGN(float fb_viewportscale[4]);
256
257         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
258         int fb_depthfunc;
259
260         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
261         int fb_blendmode;
262
263         // band boundaries
264         int miny1;
265         int maxy1;
266         int miny2;
267         int maxy2;
268
269         ATOMIC(volatile int commandoffset);
270
271         volatile bool waiting;
272         volatile bool starving;
273         void *waitcond;
274         void *drawcond;
275         void *drawmutex;
276
277         int numspans;
278         int numtriangles;
279         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
280         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
281 }
282 DPSOFTRAST_State_Thread);
283
284 typedef ATOMIC(struct DPSOFTRAST_State_s
285 {
286         int fb_width;
287         int fb_height;
288         unsigned int *fb_depthpixels;
289         unsigned int *fb_colorpixels[4];
290
291         int viewport[4];
292         ALIGN(float fb_viewportcenter[4]);
293         ALIGN(float fb_viewportscale[4]);
294
295         float color[4];
296         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
297         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
298
299         const float *pointer_vertex3f;
300         const float *pointer_color4f;
301         const unsigned char *pointer_color4ub;
302         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
303         int stride_vertex;
304         int stride_color;
305         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
306         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
307         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
308
309         int firstvertex;
310         int numvertices;
311         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
312         float *screencoord4f;
313         int drawstarty;
314         int drawendy;
315         int drawclipped;
316         
317         int shader_mode;
318         int shader_permutation;
319         int shader_exactspecularmath;
320
321         int texture_max;
322         int texture_end;
323         int texture_firstfree;
324         DPSOFTRAST_Texture *texture;
325
326         int bigendian;
327
328         // error reporting
329         const char *errorstring;
330
331         bool usethreads;
332         int interlace;
333         int numthreads;
334         DPSOFTRAST_State_Thread *threads;
335
336         ATOMIC(volatile int drawcommand);
337
338         DPSOFTRAST_State_Command_Pool commandpool;
339 }
340 DPSOFTRAST_State);
341
342 DPSOFTRAST_State dpsoftrast;
343
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (scaled by 255 and rounded) into one int laid out
// as alpha<<24 | red<<16 | green<<8 | blue.
// Arguments are parenthesized so expressions can be passed, and the shifts are
// done on unsigned values because shifting a signed int into its sign bit
// (any alpha of 128 or more) is undefined behavior.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	 (unsigned int)(int)((b) * 255.0f + 0.5f) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth buffer value DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
349
350 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
351 {
352         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
353         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
354         fb_viewportcenter[3] = 0.5f;
355         fb_viewportcenter[0] = 0.0f;
356         fb_viewportscale[1] = 0.5f * viewport[2];
357         fb_viewportscale[2] = -0.5f * viewport[3];
358         fb_viewportscale[3] = 0.5f;
359         fb_viewportscale[0] = 1.0f;
360 }
361
362 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
363 {
364         if (dpsoftrast.interlace)
365         {
366                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
367                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370         }
371         else
372         {
373                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
374                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
375         }
376 }
377
378 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
379 {
380         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
381         // and viewport projection values
382         int x1, x2;
383         int y1, y2;
384         x1 = thread->scissor[0];
385         x2 = thread->scissor[0] + thread->scissor[2];
386         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
387         y2 = dpsoftrast.fb_height - thread->scissor[1];
388         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
389         if (x1 < 0) x1 = 0;
390         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
391         if (y1 < 0) y1 = 0;
392         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
393         thread->fb_scissor[0] = x1;
394         thread->fb_scissor[1] = y1;
395         thread->fb_scissor[2] = x2 - x1;
396         thread->fb_scissor[3] = y2 - y1;
397
398         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
399         DPSOFTRAST_RecalcThread(thread);
400 }
401
402 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
403 {
404         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
405 }
406
407 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
408 {
409         if (thread->blendsubtract)
410         {
411                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
412                 {
413                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
414                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
415                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
416                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
417                 }
418         }
419         else
420         {       
421                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
422                 {
423                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
424                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
425                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
426                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
427                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
428                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
429                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
430                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
431                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
432                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
433                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
434                 }
435         }
436 }
437
// Calls DPSOFTRAST_Validate(thread, f) only when one of the bits in f is still
// set in thread->validate; always evaluates to 0.
// Both arguments are parenthesized so pointer expressions can be passed.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
439
440 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
441 {
442         mask &= thread->validate;
443         if (!mask)
444                 return;
445         if (mask & DPSOFTRAST_VALIDATE_FB)
446         {
447                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
448                 DPSOFTRAST_RecalcFB(thread);
449         }
450         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
451         {
452                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
453                 DPSOFTRAST_RecalcDepthFunc(thread);
454         }
455         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
456         {
457                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
458                 DPSOFTRAST_RecalcBlendFunc(thread);
459         }
460 }
461
462 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
463 {
464         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
465                 return &dpsoftrast.texture[index];
466         return NULL;
467 }
468
469 static void DPSOFTRAST_Texture_Grow(void)
470 {
471         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
472         DPSOFTRAST_State_Thread *thread;
473         int i;
474         int j;
475         DPSOFTRAST_Flush();
476         // expand texture array as needed
477         if (dpsoftrast.texture_max < 1024)
478                 dpsoftrast.texture_max = 1024;
479         else
480                 dpsoftrast.texture_max *= 2;
481         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
482         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
483                 if (dpsoftrast.texbound[i])
484                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
485         for (j = 0; j < dpsoftrast.numthreads; j++)
486         {
487                 thread = &dpsoftrast.threads[j];
488                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
489                         if (thread->texbound[i])
490                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
491         }
492 }
493
494 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
495 {
496         int w;
497         int h;
498         int d;
499         int size;
500         int s;
501         int texnum;
502         int mipmaps;
503         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
504         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
505         DPSOFTRAST_Texture *texture;
506         if (width*height*depth < 1)
507         {
508                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
509                 return 0;
510         }
511         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
512         {
513                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
514                 return 0;
515         }
516         switch(texformat)
517         {
518         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
519         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
520         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
521                 break;
522         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
523                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
524                 {
525                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
526                         return 0;
527                 }
528                 if (depth != 1)
529                 {
530                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
531                         return 0;
532                 }
533                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
534                 {
535                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
536                         return 0;
537                 }
538                 break;
539         }
540         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
541         {
542                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
543                 return 0;
544         }
545         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
546         {
547                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
548                 return 0;
549         }
550         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
551         {
552                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
553                 return 0;
554         }
555         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
556         {
557                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
558                 return 0;
559         }
560         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
561         {
562                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
563                 return 0;
564         }
565         // find first empty slot in texture array
566         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
567                 if (!dpsoftrast.texture[texnum].bytes)
568                         break;
569         dpsoftrast.texture_firstfree = texnum + 1;
570         if (dpsoftrast.texture_max <= texnum)
571                 DPSOFTRAST_Texture_Grow();
572         if (dpsoftrast.texture_end <= texnum)
573                 dpsoftrast.texture_end = texnum + 1;
574         texture = &dpsoftrast.texture[texnum];
575         memset(texture, 0, sizeof(*texture));
576         texture->flags = flags;
577         texture->width = width;
578         texture->height = height;
579         texture->depth = depth;
580         texture->sides = sides;
581         texture->binds = 0;
582         w = width;
583         h = height;
584         d = depth;
585         size = 0;
586         mipmaps = 0;
587         w = width;
588         h = height;
589         d = depth;
590         for (;;)
591         {
592                 s = w * h * d * sides * 4;
593                 texture->mipmap[mipmaps][0] = size;
594                 texture->mipmap[mipmaps][1] = s;
595                 texture->mipmap[mipmaps][2] = w;
596                 texture->mipmap[mipmaps][3] = h;
597                 texture->mipmap[mipmaps][4] = d;
598                 size += s;
599                 mipmaps++;
600                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
601                         break;
602                 if (w > 1) w >>= 1;
603                 if (h > 1) h >>= 1;
604                 if (d > 1) d >>= 1;
605         }
606         texture->mipmaps = mipmaps;
607         texture->size = size;
608
609         // allocate the pixels now
610         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
611
612         return texnum;
613 }
614 void DPSOFTRAST_Texture_Free(int index)
615 {
616         DPSOFTRAST_Texture *texture;
617         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
618         if (texture->binds)
619                 DPSOFTRAST_Flush();
620         if (texture->bytes)
621                 MM_FREE(texture->bytes);
622         texture->bytes = NULL;
623         memset(texture, 0, sizeof(*texture));
624         // adjust the free range and used range
625         if (dpsoftrast.texture_firstfree > index)
626                 dpsoftrast.texture_firstfree = index;
627         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
628                 dpsoftrast.texture_end--;
629 }
630 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
631 {
632         int i, x, y, z, w, layer0, layer1, row0, row1;
633         unsigned char *o, *i0, *i1, *i2, *i3;
634         DPSOFTRAST_Texture *texture;
635         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636         if (texture->mipmaps <= 1)
637                 return;
638         for (i = 1;i < texture->mipmaps;i++)
639         {
640                 for (z = 0;z < texture->mipmap[i][4];z++)
641                 {
642                         layer0 = z*2;
643                         layer1 = z*2+1;
644                         if (layer1 >= texture->mipmap[i-1][4])
645                                 layer1 = texture->mipmap[i-1][4]-1;
646                         for (y = 0;y < texture->mipmap[i][3];y++)
647                         {
648                                 row0 = y*2;
649                                 row1 = y*2+1;
650                                 if (row1 >= texture->mipmap[i-1][3])
651                                         row1 = texture->mipmap[i-1][3]-1;
652                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
653                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
654                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
655                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
656                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
657                                 w = texture->mipmap[i][2];
658                                 if (layer1 > layer0)
659                                 {
660                                         if (texture->mipmap[i-1][2] > 1)
661                                         {
662                                                 // average 3D texture
663                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
664                                                 {
665                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
666                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
667                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
668                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
669                                                 }
670                                         }
671                                         else
672                                         {
673                                                 // average 3D mipmap with parent width == 1
674                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
675                                                 {
676                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
677                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
678                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
679                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
680                                                 }
681                                         }
682                                 }
683                                 else
684                                 {
685                                         if (texture->mipmap[i-1][2] > 1)
686                                         {
687                                                 // average 2D texture (common case)
688                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
689                                                 {
690                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
691                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
692                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
693                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
694                                                 }
695                                         }
696                                         else
697                                         {
698                                                 // 2D texture with parent width == 1
699                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
700                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
701                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
702                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
703                                         }
704                                 }
705                         }
706                 }
707         }
708 }
709 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
710 {
711         DPSOFTRAST_Texture *texture;
712         unsigned char *dst;
713         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
714         if (texture->binds)
715                 DPSOFTRAST_Flush();
716         if (pixels)
717         {
718                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
719                 while (blockheight > 0)
720                 {
721                         memcpy(dst, pixels, blockwidth * 4);
722                         pixels += blockwidth * 4;
723                         dst += texture->mipmap[0][2] * 4;
724                         blockheight--;
725                 }
726         }
727         DPSOFTRAST_Texture_CalculateMipmaps(index);
728 }
729 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (texture->binds)
734                 DPSOFTRAST_Flush();
735         if (pixels)
736                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
737         DPSOFTRAST_Texture_CalculateMipmaps(index);
738 }
739 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
740 {
741         DPSOFTRAST_Texture *texture;
742         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
743         return texture->mipmap[mip][2];
744 }
745 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
746 {
747         DPSOFTRAST_Texture *texture;
748         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
749         return texture->mipmap[mip][3];
750 }
751 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
752 {
753         DPSOFTRAST_Texture *texture;
754         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755         return texture->mipmap[mip][4];
756 }
757 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         if (texture->binds)
762                 DPSOFTRAST_Flush();
763         return texture->bytes + texture->mipmap[mip][0];
764 }
765 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
766 {
767         DPSOFTRAST_Texture *texture;
768         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
770         {
771                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
772                 return;
773         }
774         if (texture->binds)
775                 DPSOFTRAST_Flush();
776         texture->filter = filter;
777 }
778
779 static void DPSOFTRAST_Draw_FlushThreads(void);
780
781 static void DPSOFTRAST_Draw_SyncCommands(void)
782 {
783         if(dpsoftrast.usethreads) MEMORY_BARRIER;
784         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
785 }
786
// Waits until at least `space` units of the command pool are free.
// Returns immediately if the cached usedcommands count already leaves enough
// room.  Otherwise the pending commands are published with
// DPSOFTRAST_Draw_SyncCommands and the function repeatedly waits on the
// thread that is furthest behind until enough of the pool is unused.
// commandpool.usedcommands is updated with the recomputed value on exit.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
	DPSOFTRAST_State_Thread *thread;
	int i;
	int freecommand = dpsoftrast.commandpool.freecommand;
	int usedcommands = dpsoftrast.commandpool.usedcommands;
	// fast path: enough room according to the cached count
	if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
		return;
	DPSOFTRAST_Draw_SyncCommands();
	for(;;)
	{
		int waitindex = -1;
		int commandoffset;
		// recompute the used amount as the largest distance (with wraparound)
		// between any thread's read position and the write position,
		// remembering which thread that is
		usedcommands = 0;
		for (i = 0; i < dpsoftrast.numthreads; i++)
		{
			thread = &dpsoftrast.threads[i]; 
			commandoffset = freecommand - thread->commandoffset;
			if (commandoffset < 0)
				commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
			if (commandoffset > usedcommands)
			{
				waitindex = i;
				usedcommands = commandoffset;
			}
		}
		// done when there is enough room, or when no thread is behind at all
		if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
			break;
		thread = &dpsoftrast.threads[waitindex];
		Thread_LockMutex(thread->drawmutex);
		// re-check under the mutex: only wait if the thread has not caught up
		// to the published draw position yet
		if (thread->commandoffset != dpsoftrast.drawcommand)
		{
			thread->waiting = true;
			// NOTE(review): `starving` presumably means the thread is blocked
			// on drawcond waiting for commands - confirm against the thread loop
			if (thread->starving) Thread_CondSignal(thread->drawcond);
			Thread_CondWait(thread->waitcond, thread->drawmutex);
			thread->waiting = false;
		}
		Thread_UnlockMutex(thread->drawmutex);
	}
	dpsoftrast.commandpool.usedcommands = usedcommands;
}
828
// Rounds `size` up to the next multiple of COMMAND_SIZE
// (COMMAND_SIZE is assumed to be a power of two).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> from the command pool; the requested size is the
// struct size rounded up with DPSOFTRAST_ALIGNCOMMAND.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
833
834 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
835 {
836         DPSOFTRAST_Command *command;
837         int freecommand = dpsoftrast.commandpool.freecommand;
838         int usedcommands = dpsoftrast.commandpool.usedcommands;
839         int extra = sizeof(DPSOFTRAST_Command);
840         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
842         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
843         {
844                 if (dpsoftrast.usethreads)
845                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
846                 else
847                         DPSOFTRAST_Draw_FlushThreads();
848                 freecommand = dpsoftrast.commandpool.freecommand;
849                 usedcommands = dpsoftrast.commandpool.usedcommands;
850         }
851         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
852         {
853                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
854                 command->opcode = DPSOFTRAST_OPCODE_Reset;
855                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
856                 freecommand = 0;
857         }
858         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
859         command->opcode = opcode;
860         command->commandsize = size;
861         freecommand += size;
862         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
863                 freecommand = 0;
864         dpsoftrast.commandpool.freecommand = freecommand;
865         dpsoftrast.commandpool.usedcommands = usedcommands + size;
866         return command;
867 }
868
869 static void DPSOFTRAST_UndoCommand(int size)
870 {
871         int freecommand = dpsoftrast.commandpool.freecommand;
872         int usedcommands = dpsoftrast.commandpool.usedcommands;
873         freecommand -= size;
874         if (freecommand < 0)
875                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
876         usedcommands -= size;
877         dpsoftrast.commandpool.freecommand = freecommand;
878         dpsoftrast.commandpool.usedcommands = usedcommands;
879 }
880                 
881 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
882 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
883 {
884         thread->viewport[0] = command->x;
885         thread->viewport[1] = command->y;
886         thread->viewport[2] = command->width;
887         thread->viewport[3] = command->height;
888         thread->validate |= DPSOFTRAST_VALIDATE_FB;
889 }
890 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
891 {
892         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
893         command->x = x;
894         command->y = y;
895         command->width = width;
896         command->height = height;
897
898         dpsoftrast.viewport[0] = x;
899         dpsoftrast.viewport[1] = y;
900         dpsoftrast.viewport[2] = width;
901         dpsoftrast.viewport[3] = height;
902         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
903 }
904
905 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
906 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
907 {
908         int i, x1, y1, x2, y2, w, h, x, y;
909         int miny1, maxy1, miny2, maxy2;
910         int bandy;
911         unsigned int *p;
912         unsigned int c;
913         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
914         miny1 = thread->miny1;
915         maxy1 = thread->maxy1;
916         miny2 = thread->miny2;
917         maxy2 = thread->maxy2;
918         x1 = thread->fb_scissor[0];
919         y1 = thread->fb_scissor[1];
920         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
921         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
922         if (y1 < miny1) y1 = miny1;
923         if (y2 > maxy2) y2 = maxy2;
924         w = x2 - x1;
925         h = y2 - y1;
926         if (w < 1 || h < 1)
927                 return;
928         // FIXME: honor fb_colormask?
929         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
930         for (i = 0;i < 4;i++)
931         {
932                 if (!dpsoftrast.fb_colorpixels[i])
933                         continue;
934                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
935                 for (;y < bandy;y++)
936                 {
937                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
938                         for (x = x1;x < x2;x++)
939                                 p[x] = c;
940                 }
941         }
942 }
943 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
944 {
945         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
946         command->r = r;
947         command->g = g;
948         command->b = b;
949         command->a = a;
950 }
951
952 DEFCOMMAND(3, ClearDepth, float depth;)
953 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
954 {
955         int x1, y1, x2, y2, w, h, x, y;
956         int miny1, maxy1, miny2, maxy2;
957         int bandy;
958         unsigned int *p;
959         unsigned int c;
960         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
961         miny1 = thread->miny1;
962         maxy1 = thread->maxy1;
963         miny2 = thread->miny2;
964         maxy2 = thread->maxy2;
965         x1 = thread->fb_scissor[0];
966         y1 = thread->fb_scissor[1];
967         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
968         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
969         if (y1 < miny1) y1 = miny1;
970         if (y2 > maxy2) y2 = maxy2;
971         w = x2 - x1;
972         h = y2 - y1;
973         if (w < 1 || h < 1)
974                 return;
975         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
976         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
977         for (;y < bandy;y++)
978         {
979                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
980                 for (x = x1;x < x2;x++)
981                         p[x] = c;
982         }
983 }
984 void DPSOFTRAST_ClearDepth(float d)
985 {
986         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
987         command->depth = d;
988 }
989
990 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
991 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
992 {
993         thread->colormask[0] = command->r != 0;
994         thread->colormask[1] = command->g != 0;
995         thread->colormask[2] = command->b != 0;
996         thread->colormask[3] = command->a != 0;
997         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
998 }
999 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1000 {
1001         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1002         command->r = r;
1003         command->g = g;
1004         command->b = b;
1005         command->a = a;
1006 }
1007
1008 DEFCOMMAND(5, DepthTest, int enable;)
1009 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1010 {
1011         thread->depthtest = command->enable;
1012         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1013 }
1014 void DPSOFTRAST_DepthTest(int enable)
1015 {
1016         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1017         command->enable = enable;
1018 }
1019
1020 DEFCOMMAND(6, ScissorTest, int enable;)
1021 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1022 {
1023         thread->scissortest = command->enable;
1024         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1025 }
1026 void DPSOFTRAST_ScissorTest(int enable)
1027 {
1028         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1029         command->enable = enable;
1030 }
1031
1032 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1033 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1034 {
1035         thread->scissor[0] = command->x;
1036         thread->scissor[1] = command->y;
1037         thread->scissor[2] = command->width;
1038         thread->scissor[3] = command->height;
1039         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1040 }
1041 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1042 {
1043         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1044         command->x = x;
1045         command->y = y;
1046         command->width = width;
1047         command->height = height;
1048 }
1049
1050 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1051 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1052 {
1053         thread->blendfunc[0] = command->sfactor;
1054         thread->blendfunc[1] = command->dfactor;
1055         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1056 }
1057 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1058 {
1059         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1060         command->sfactor = sfactor;
1061         command->dfactor = dfactor;
1062 }
1063
1064 DEFCOMMAND(9, BlendSubtract, int enable;)
1065 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1066 {
1067         thread->blendsubtract = command->enable;
1068         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1069 }
1070 void DPSOFTRAST_BlendSubtract(int enable)
1071 {
1072         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1073         command->enable = enable;
1074 }
1075
1076 DEFCOMMAND(10, DepthMask, int enable;)
1077 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1078 {
1079         thread->depthmask = command->enable;
1080 }
1081 void DPSOFTRAST_DepthMask(int enable)
1082 {
1083         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1084         command->enable = enable;
1085 }
1086
1087 DEFCOMMAND(11, DepthFunc, int func;)
1088 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1089 {
1090         thread->depthfunc = command->func;
1091 }
1092 void DPSOFTRAST_DepthFunc(int func)
1093 {
1094         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1095         command->func = func;
1096 }
1097
1098 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1099 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1100 {
1101         thread->depthrange[0] = command->nearval;
1102         thread->depthrange[1] = command->farval;
1103 }
1104 void DPSOFTRAST_DepthRange(float nearval, float farval)
1105 {
1106         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1107         command->nearval = nearval;
1108         command->farval = farval;
1109 }
1110
1111 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1112 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1113 {
1114         thread->polygonoffset[0] = command->alongnormal;
1115         thread->polygonoffset[1] = command->intoview;
1116 }
1117 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1118 {
1119         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1120         command->alongnormal = alongnormal;
1121         command->intoview = intoview;
1122 }
1123
1124 DEFCOMMAND(14, CullFace, int mode;)
1125 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1126 {
1127         thread->cullface = command->mode;
1128 }
1129 void DPSOFTRAST_CullFace(int mode)
1130 {
1131         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1132         command->mode = mode;
1133 }
1134
1135 DEFCOMMAND(15, AlphaTest, int enable;)
1136 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1137 {
1138         thread->alphatest = command->enable;
1139 }
1140 void DPSOFTRAST_AlphaTest(int enable)
1141 {
1142         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1143         command->enable = enable;
1144 }
1145
1146 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1147 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1148 {
1149         thread->alphafunc = command->func;
1150         thread->alphavalue = command->ref;
1151 }
1152 void DPSOFTRAST_AlphaFunc(int func, float ref)
1153 {
1154         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1155         command->func = func;
1156         command->ref = ref;
1157 }
1158
1159 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1160 {
1161         dpsoftrast.color[0] = r;
1162         dpsoftrast.color[1] = g;
1163         dpsoftrast.color[2] = b;
1164         dpsoftrast.color[3] = a;
1165 }
1166
1167 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1168 {
1169         int outstride = blockwidth * 4;
1170         int instride = dpsoftrast.fb_width * 4;
1171         int bx1 = blockx;
1172         int by1 = blocky;
1173         int bx2 = blockx + blockwidth;
1174         int by2 = blocky + blockheight;
1175         int bw;
1176         int x;
1177         int y;
1178         unsigned char *inpixels;
1179         unsigned char *b;
1180         unsigned char *o;
1181         DPSOFTRAST_Flush();
1182         if (bx1 < 0) bx1 = 0;
1183         if (by1 < 0) by1 = 0;
1184         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1185         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1186         bw = bx2 - bx1;
1187         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1188         if (dpsoftrast.bigendian)
1189         {
1190                 for (y = by1;y < by2;y++)
1191                 {
1192                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1193                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1194                         for (x = bx1;x < bx2;x++)
1195                         {
1196                                 o[0] = b[3];
1197                                 o[1] = b[2];
1198                                 o[2] = b[1];
1199                                 o[3] = b[0];
1200                                 o += 4;
1201                                 b += 4;
1202                         }
1203                 }
1204         }
1205         else
1206         {
1207                 for (y = by1;y < by2;y++)
1208                 {
1209                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1210                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1211                         memcpy(o, b, bw*4);
1212                 }
1213         }
1214
1215 }
1216 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1217 {
1218         int tx1 = tx;
1219         int ty1 = ty;
1220         int tx2 = tx + width;
1221         int ty2 = ty + height;
1222         int sx1 = sx;
1223         int sy1 = sy;
1224         int sx2 = sx + width;
1225         int sy2 = sy + height;
1226         int swidth;
1227         int sheight;
1228         int twidth;
1229         int theight;
1230         int sw;
1231         int sh;
1232         int tw;
1233         int th;
1234         int y;
1235         unsigned int *spixels;
1236         unsigned int *tpixels;
1237         DPSOFTRAST_Texture *texture;
1238         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1239         if (mip < 0 || mip >= texture->mipmaps) return;
1240         DPSOFTRAST_Flush();
1241         spixels = dpsoftrast.fb_colorpixels[0];
1242         swidth = dpsoftrast.fb_width;
1243         sheight = dpsoftrast.fb_height;
1244         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1245         twidth = texture->mipmap[mip][2];
1246         theight = texture->mipmap[mip][3];
1247         if (tx1 < 0) tx1 = 0;
1248         if (ty1 < 0) ty1 = 0;
1249         if (tx2 > twidth) tx2 = twidth;
1250         if (ty2 > theight) ty2 = theight;
1251         if (sx1 < 0) sx1 = 0;
1252         if (sy1 < 0) sy1 = 0;
1253         if (sx2 > swidth) sx2 = swidth;
1254         if (sy2 > sheight) sy2 = sheight;
1255         tw = tx2 - tx1;
1256         th = ty2 - ty1;
1257         sw = sx2 - sx1;
1258         sh = sy2 - sy1;
1259         if (tw > sw) tw = sw;
1260         if (th > sh) th = sh;
1261         if (tw < 1 || th < 1)
1262                 return;
1263         sy1 = sheight - 1 - sy1;
1264         for (y = 0;y < th;y++)
1265                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1266         if (texture->mipmaps > 1)
1267                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1268 }
1269
1270 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1271 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1272 {
1273         if (thread->texbound[command->unitnum])
1274                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1275         thread->texbound[command->unitnum] = command->texture;
1276 }
1277 void DPSOFTRAST_SetTexture(int unitnum, int index)
1278 {
1279         DPSOFTRAST_Command_SetTexture *command;
1280         DPSOFTRAST_Texture *texture;
1281         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1282         {
1283                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1284                 return;
1285         }
1286         texture = DPSOFTRAST_Texture_GetByIndex(index);
1287         if (index && !texture)
1288         {
1289                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1290                 return;
1291         }
1292
1293         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1294         command->unitnum = unitnum;
1295         command->texture = texture;
1296
1297         dpsoftrast.texbound[unitnum] = texture;
1298         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1299 }
1300
1301 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1302 {
1303         dpsoftrast.pointer_vertex3f = vertex3f;
1304         dpsoftrast.stride_vertex = stride;
1305 }
1306 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1307 {
1308         dpsoftrast.pointer_color4f = color4f;
1309         dpsoftrast.pointer_color4ub = NULL;
1310         dpsoftrast.stride_color = stride;
1311 }
1312 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1313 {
1314         dpsoftrast.pointer_color4f = NULL;
1315         dpsoftrast.pointer_color4ub = color4ub;
1316         dpsoftrast.stride_color = stride;
1317 }
1318 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1319 {
1320         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1321         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1322         dpsoftrast.stride_texcoord[unitnum] = stride;
1323 }
1324
1325 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1326 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1327 {
1328         thread->shader_mode = command->mode;
1329         thread->shader_permutation = command->permutation;
1330         thread->shader_exactspecularmath = command->exactspecularmath;
1331 }
1332 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1333 {
1334         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1335         command->mode = mode;
1336         command->permutation = permutation;
1337         command->exactspecularmath = exactspecularmath;
1338
1339         dpsoftrast.shader_mode = mode;
1340         dpsoftrast.shader_permutation = permutation;
1341         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1342 }
1343
1344 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1345 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1346 {
1347         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1348 }
1349 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1350 {
1351         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1352         command->index = index;
1353         command->val[0] = v0;
1354         command->val[1] = v1;
1355         command->val[2] = v2;
1356         command->val[3] = v3;
1357
1358         dpsoftrast.uniform4f[index*4+0] = v0;
1359         dpsoftrast.uniform4f[index*4+1] = v1;
1360         dpsoftrast.uniform4f[index*4+2] = v2;
1361         dpsoftrast.uniform4f[index*4+3] = v3;
1362 }
1363 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1364 {
1365         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1366         command->index = index;
1367         memcpy(command->val, v, sizeof(command->val));
1368
1369         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1370 }
1371
1372 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1373 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1374 {
1375         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1376 }
1377 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1378 {
1379 #ifdef SSE_POSSIBLE
1380         int i, index;
1381         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1382         {
1383                 __m128 m0, m1, m2, m3;
1384                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1385                 command->index = (DPSOFTRAST_UNIFORM)index;
1386                 if (((size_t)v)&(ALIGN_SIZE-1))
1387                 {
1388                         m0 = _mm_loadu_ps(v);
1389                         m1 = _mm_loadu_ps(v+4);
1390                         m2 = _mm_loadu_ps(v+8);
1391                         m3 = _mm_loadu_ps(v+12);
1392                 }
1393                 else
1394                 {
1395                         m0 = _mm_load_ps(v);
1396                         m1 = _mm_load_ps(v+4);
1397                         m2 = _mm_load_ps(v+8);
1398                         m3 = _mm_load_ps(v+12);
1399                 }
1400                 if (transpose)
1401                 {
1402                         __m128 t0, t1, t2, t3;
1403                         t0 = _mm_unpacklo_ps(m0, m1);
1404                         t1 = _mm_unpacklo_ps(m2, m3);
1405                         t2 = _mm_unpackhi_ps(m0, m1);
1406                         t3 = _mm_unpackhi_ps(m2, m3);
1407                         m0 = _mm_movelh_ps(t0, t1);
1408                         m1 = _mm_movehl_ps(t1, t0);
1409                         m2 = _mm_movelh_ps(t2, t3);
1410                         m3 = _mm_movehl_ps(t3, t2);                     
1411                 }
1412                 _mm_store_ps(command->val, m0);
1413                 _mm_store_ps(command->val+4, m1);
1414                 _mm_store_ps(command->val+8, m2);
1415                 _mm_store_ps(command->val+12, m3);
1416                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1417                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1418                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1419                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1420         }
1421 #endif
1422 }
1423
1424 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1425 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1426 {
1427         thread->uniform1i[command->index] = command->val;
1428 }
1429 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1430 {
1431         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1432         command->index = index;
1433         command->val = i0;
1434
1435         dpsoftrast.uniform1i[command->index] = i0;
1436 }
1437
1438 #ifdef SSE_POSSIBLE
1439 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1440 {
1441         float *end = dst + size*4;
1442         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1443         {
1444                 while (dst < end)
1445                 {
1446                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1447                         dst += 4;
1448                         src += stride;
1449                 }
1450         }
1451         else
1452         {
1453                 while (dst < end)
1454                 {
1455                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1456                         dst += 4;
1457                         src += stride;
1458                 }
1459         }
1460 }
1461
1462 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1463 {
1464         float *end = dst + size*4;
1465         if (stride == sizeof(float[3]))
1466         {
1467                 float *end4 = dst + (size&~3)*4;        
1468                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1469                 {
1470                         while (dst < end4)
1471                         {
1472                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1473                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1474                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1480                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1481                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1484                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485                                 dst += 16;
1486                                 src += 4*sizeof(float[3]);
1487                         }
1488                 }
1489                 else
1490                 {
1491                         while (dst < end4)
1492                         {
1493                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1494                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1495                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1496                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1497                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1501                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1502                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1505                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1506                                 dst += 16;
1507                                 src += 4*sizeof(float[3]);
1508                         }
1509                 }
1510         }
1511         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1512         {
1513                 while (dst < end)
1514                 {
1515                         __m128 v = _mm_loadu_ps((const float *)src);
1516                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1517                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1518                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1519                         _mm_store_ps(dst, v);
1520                         dst += 4;
1521                         src += stride;
1522                 }
1523         }
1524         else
1525         {
1526                 while (dst < end)
1527                 {
1528                         __m128 v = _mm_load_ps((const float *)src);
1529                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1530                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1531                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1532                         _mm_store_ps(dst, v);
1533                         dst += 4;
1534                         src += stride;
1535                 }
1536         }
1537 }
1538
1539 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1540 {
1541         float *end = dst + size*4;
1542         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1543         if (stride == sizeof(float[2]))
1544         {
1545                 float *end2 = dst + (size&~1)*4;
1546                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1547                 {
1548                         while (dst < end2)
1549                         {
1550                                 __m128 v = _mm_loadu_ps((const float *)src);
1551                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1552                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1553                                 dst += 8;
1554                                 src += 2*sizeof(float[2]);
1555                         }
1556                 }
1557                 else
1558                 {
1559                         while (dst < end2)
1560                         {
1561                                 __m128 v = _mm_load_ps((const float *)src);
1562                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1563                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1564                                 dst += 8;
1565                                 src += 2*sizeof(float[2]);
1566                         }
1567                 }
1568         }
1569         while (dst < end)
1570         {
1571                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1572                 dst += 4;
1573                 src += stride;
1574         }
1575 }
1576
1577 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1578 {
1579         float *end = dst + size*4;
1580         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1581         if (stride == sizeof(unsigned char[4]))
1582         {
1583                 float *end4 = dst + (size&~3)*4;
1584                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1585                 {
1586                         while (dst < end4)
1587                         {
1588                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1589                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1590                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1591                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1592                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1593                                 dst += 16;
1594                                 src += 4*sizeof(unsigned char[4]);
1595                         }
1596                 }
1597                 else
1598                 {
1599                         while (dst < end4)
1600                         {
1601                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1602                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1603                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1604                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1605                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1606                                 dst += 16;
1607                                 src += 4*sizeof(unsigned char[4]);
1608                         }
1609                 }
1610         }
1611         while (dst < end)
1612         {
1613                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1614                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1615                 dst += 4;
1616                 src += stride;
1617         }
1618 }
1619
// Writes `size` copies of the 4-float vector at `src` to `dst`.
// src may be unaligned; dst is written with aligned stores and has to be
// 16-byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
        const __m128 value = _mm_loadu_ps(src);
        float *stop = dst + 4*size;
        for (; dst < stop; dst += 4)
                _mm_store_ps(dst, value);
}
1630 #endif
1631
// Multiplies numitems 4-float vectors from in4f by the 16-float matrix
// inmatrix16f and stores the results in out4f:
//   out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
// out4f is written with aligned stores; in4f and inmatrix16f may be unaligned.
// out4f may be the same buffer as in4f.
// Without SSE_POSSIBLE this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        __m128 matx, maty, matz, matw;
        float *stop;
        if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
        {
                // identity matrix: a plain copy (or nothing at all when done in place) is enough
                if (out4f != in4f)
                        memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
        stop = out4f + numitems*4;
        matx = _mm_loadu_ps(inmatrix16f);
        maty = _mm_loadu_ps(inmatrix16f + 4);
        matz = _mm_loadu_ps(inmatrix16f + 8);
        matw = _mm_loadu_ps(inmatrix16f + 12);
        if (((size_t)in4f)&(ALIGN_SIZE-1)) // unaligned input
        {
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        __m128 v = _mm_loadu_ps(in4f);
                        __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), matx);
                        __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), maty);
                        __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), matz);
                        __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), matw);
                        _mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
                }
        }
        else
        {
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        __m128 v = _mm_load_ps(in4f);
                        __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), matx);
                        __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), maty);
                        __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), matz);
                        __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), matw);
                        _mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
                }
        }
#endif
}
1679
// Copies numitems 4-float vectors from in4f to out4f.
// Nothing is done when numitems is not positive or when both pointers refer to
// the same buffer.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
        // memcpy must not be handed overlapping buffers, so the in-place case is
        // skipped (DPSOFTRAST_Vertex_Transform guards its identity copy the same way);
        // a negative count would otherwise turn into a huge size_t
        if (numitems <= 0 || out4f == in4f)
                return;
        memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1684
1685 #ifdef SSE_POSSIBLE
// Perspective-divides and viewport-maps one vertex.
//   in:  __m128 holding (x, y, z, w); evaluated once
//   out: receives (cx + sx*x/w, cy + sy*y/w, cz + sz*z/w, c0 + s0/w)
// viewportcenter/viewportscale are __m128 values whose lane 0 holds the c0/s0 term
// (applied to 1/w) and whose lanes 1..3 hold the x, y and z terms.
// How it works: the vertex is rotated to (w, x, y, z), lane 0 is replaced by 1.0f,
// everything is scaled, divided by w and offset, then rotated back.
// NOTE: expands to a bare { } block declaring locals named p and w, so the other
// arguments must not be expressions that refer to caller variables named p or w.
1686 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1687 { \
1688         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1689         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1690         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1691         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1692 }
1693
// Perspective-divides and viewport-maps one vertex.
// As written, this expands to exactly the same code as DPSOFTRAST_PROJECTVERTEX:
// all four lanes are computed, not only y.
//   in:  __m128 holding (x, y, z, w); evaluated once
//   out: receives (cx + sx*x/w, cy + sy*y/w, cz + sz*z/w, c0 + s0/w)
// NOTE: expands to a bare { } block declaring locals named p and w, so the other
// arguments must not be expressions that refer to caller variables named p or w.
1694 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1695 { \
1696         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1697         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1698         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1699         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1700 }
1701
// Transforms one vertex by a matrix given as four __m128 values.
//   in:  __m128 holding (x, y, z, w); evaluated once
//   out: receives x*m0 + (y*m1 + (z*m2 + w*m3)), computed lane by lane
// NOTE: expands to a bare { } block declaring a local named p, so m0..m3 must not
// be expressions that refer to a caller variable named p.
1702 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1703 { \
1704         __m128 p = (in); \
1705         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1706                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1707                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1708                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1709 }
1710
// Computes the range of screen rows covered by a bounding box and reports which of
// its eight corners lie behind the z + w = 0 plane after transformation.
//   minposf, maxposf: the two extreme corners of the box, 4 floats each; read with
//                     aligned loads, so both must be 16-byte aligned
//   inmatrix16f:      16-float matrix applied to every corner (may be unaligned)
//   starty:           receives the truncated row computed from the LARGEST projected y/w
//   endy:             receives the truncated row computed from the SMALLEST projected y/w, plus one
//                     (presumably the y viewport scale is negative so that a larger y/w is a
//                     smaller row - TODO confirm against the viewport setup)
//   returns:          bitmask in which bit k is still set unless corner k has z + w >= 0
// Corner numbering: bit 0 of k selects x from maxpos, bit 1 selects y from maxpos,
// bit 2 selects z and w from maxpos; cleared bits take the component from minpos.
// The helper macros BBFRONT and BBCLIP are defined inside this function and are not #undef'd.
1711 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1712 {
        // every corner starts out marked as clipped; BBFRONT clears the bits of corners in front
1713         int clipmask = 0xFF;
1714         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
        // minproj/maxproj track the smallest/largest y/w in lane 0; they start inverted (2, -2)
1715         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1716         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1717         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
        // swap lanes 0 and 1 of every matrix vector so that transformed corners come out as
        // (y, x, z, w): y then sits in lane 0 where the scalar (_ss) instructions operate
1718         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1719         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1720         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1721         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transforms corner k into bb[k] and stores z + w in lane 0 of clipdist[k];
        // when that distance is >= 0 the corner's clipmask bit is cleared and its y/w is
        // merged into minproj/maxproj
1722         #define BBFRONT(k, pos) \
1723         { \
1724                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1725                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1726                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1727                 { \
1728                         __m128 proj; \
1729                         clipmask &= ~(1<<k); \
1730                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1731                         minproj = _mm_min_ss(minproj, proj); \
1732                         maxproj = _mm_max_ss(maxproj, proj); \
1733                 } \
1734         }
        // the eight corners are assembled by mixing lanes of minpos and maxpos (see corner numbering above)
1735         BBFRONT(0, minpos); 
1736         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1737         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1738         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1739         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1740         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1741         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1742         BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner that is behind the plane, walks its three box edges
        // (neighbours k^1, k^2, k^4). For each neighbour that is in front, the point where the
        // edge crosses z + w = 0 is found by linear interpolation
        // (frac = dist[k] / (dist[k] - dist[neighbour])) and its y/w is merged into minproj/maxproj
1743         #define BBCLIP(k) \
1744         { \
1745                 if (clipmask&(1<<k)) \
1746                 { \
1747                         if (!(clipmask&(1<<(k^1)))) \
1748                         { \
1749                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1750                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1751                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1752                                 minproj = _mm_min_ss(minproj, proj); \
1753                                 maxproj = _mm_max_ss(maxproj, proj); \
1754                         } \
1755                         if (!(clipmask&(1<<(k^2)))) \
1756                         { \
1757                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1758                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1759                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1760                                 minproj = _mm_min_ss(minproj, proj); \
1761                                 maxproj = _mm_max_ss(maxproj, proj); \
1762                         } \
1763                         if (!(clipmask&(1<<(k^4)))) \
1764                         { \
1765                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1766                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1767                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1768                                 minproj = _mm_min_ss(minproj, proj); \
1769                                 maxproj = _mm_max_ss(maxproj, proj); \
1770                         } \
1771                 } \
1772         }
1773         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // move lane 2 of the viewport vectors (the y term used by DPSOFTRAST_PROJECTVERTEX) into lane 0
1774         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1775         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // limit the projected range to [-2, 2] before converting it to rows
1776         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1777         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1778         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1779         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // conversions truncate toward zero
1780         *starty = _mm_cvttss_si32(maxproj);
1781         *endy = _mm_cvttss_si32(minproj)+1;
1782         return clipmask;
1783 }
1784         
1785 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1786 {
1787         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1788         float *end = out4f + numitems*4;
1789         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1790         __m128 minpos, maxpos;
1791         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1792         {
1793                 minpos = maxpos = _mm_loadu_ps(in4f);
1794                 while (out4f < end)
1795                 {
1796                         __m128 v = _mm_loadu_ps(in4f);
1797                         minpos = _mm_min_ps(minpos, v);
1798                         maxpos = _mm_max_ps(maxpos, v);
1799                         _mm_store_ps(out4f, v);
1800                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801                         _mm_store_ps(screen4f, v);
1802                         in4f += 4;
1803                         out4f += 4;
1804                         screen4f += 4;
1805                 }
1806         }
1807         else
1808         {
1809                 minpos = maxpos = _mm_load_ps(in4f);
1810                 while (out4f < end)
1811                 {
1812                         __m128 v = _mm_load_ps(in4f);
1813                         minpos = _mm_min_ps(minpos, v);
1814                         maxpos = _mm_max_ps(maxpos, v);
1815                         _mm_store_ps(out4f, v);
1816                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1817                         _mm_store_ps(screen4f, v);
1818                         in4f += 4;
1819                         out4f += 4;
1820                         screen4f += 4;
1821                 }
1822         }
1823         if (starty && endy) 
1824         {
1825                 ALIGN(float minposf[4]);
1826                 ALIGN(float maxposf[4]);
1827                 _mm_store_ps(minposf, minpos);
1828                 _mm_store_ps(maxposf, maxpos);
1829                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1830         }
1831         return 0;
1832 }
1833
1834 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1835 {
1836         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1837         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1838         float *end;
1839         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1840                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1841         end = out4f + numitems*4;
1842         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1843         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1844         m0 = _mm_loadu_ps(inmatrix16f);
1845         m1 = _mm_loadu_ps(inmatrix16f + 4);
1846         m2 = _mm_loadu_ps(inmatrix16f + 8);
1847         m3 = _mm_loadu_ps(inmatrix16f + 12);
1848         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1849         {
1850                 minpos = maxpos = _mm_loadu_ps(in4f);
1851                 while (out4f < end)
1852                 {
1853                         __m128 v = _mm_loadu_ps(in4f);
1854                         minpos = _mm_min_ps(minpos, v);
1855                         maxpos = _mm_max_ps(maxpos, v);
1856                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1857                         _mm_store_ps(out4f, v);
1858                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1859                         _mm_store_ps(screen4f, v);
1860                         in4f += 4;
1861                         out4f += 4;
1862                         screen4f += 4;
1863                 }
1864         }
1865         else
1866         {
1867                 minpos = maxpos = _mm_load_ps(in4f);
1868                 while (out4f < end)
1869                 {
1870                         __m128 v = _mm_load_ps(in4f);
1871                         minpos = _mm_min_ps(minpos, v);
1872                         maxpos = _mm_max_ps(maxpos, v);
1873                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1874                         _mm_store_ps(out4f, v);
1875                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1876                         _mm_store_ps(screen4f, v);
1877                         in4f += 4;
1878                         out4f += 4;
1879                         screen4f += 4;
1880                 }
1881         }
1882         if (starty && endy) 
1883         {
1884                 ALIGN(float minposf[4]);
1885                 ALIGN(float maxposf[4]);
1886                 _mm_store_ps(minposf, minpos);
1887                 _mm_store_ps(maxposf, maxpos);
1888                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1889         }
1890         return 0;
1891 }
1892 #endif
1893
// Expands the vertex attribute `inarray` of the current draw range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices) into 4-float vectors in
// dpsoftrast.post_array4f[outarray] and returns that buffer.
//   DPSOFTRAST_ARRAY_POSITION: 3-float positions
//   DPSOFTRAST_ARRAY_COLOR:    4-float colors if present, else 4-byte colors if
//                              present, else dpsoftrast.color repeated for every vertex
//   anything else:             treated as texcoord array (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//                              with 2, 3 or 4 float components; the buffer is left
//                              untouched when there is no pointer or the component
//                              count is something else
// Returns NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *outf = dpsoftrast.post_array4f[outarray];
        const int first = dpsoftrast.firstvertex;
        const int count = dpsoftrast.numvertices;
        const unsigned char *inb;
        int stride;
        if (inarray == DPSOFTRAST_ARRAY_POSITION)
        {
                stride = dpsoftrast.stride_vertex;
                inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
                DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
        }
        else if (inarray == DPSOFTRAST_ARRAY_COLOR)
        {
                stride = dpsoftrast.stride_color;
                if (dpsoftrast.pointer_color4f)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
                        DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
                }
                else if (dpsoftrast.pointer_color4ub)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
                        DPSOFTRAST_Load4bTo4f(outf, inb, count, stride);
                }
                else
                {
                        // no color array bound: use the constant color
                        DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
                }
        }
        else
        {
                const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
                stride = dpsoftrast.stride_texcoord[unit];
                if (dpsoftrast.pointer_texcoordf[unit])
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
                        switch (dpsoftrast.components_texcoord[unit])
                        {
                        case 2:
                                DPSOFTRAST_Load2fTo4f(outf, inb, count, stride);
                                break;
                        case 3:
                                DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
                                break;
                        case 4:
                                DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
                                break;
                        default:
                                // other component counts are not converted
                                break;
                        }
                }
        }
        return outf;
#else
        return NULL;
#endif
}
1952
1953 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1954 {
1955         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1956         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1957         return data;
1958 }
1959
1960 #if 0
// Projects a vertex attribute array to screen space without transforming it:
// screen coordinates go to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty/drawendy and the clip mask to dpsoftrast.drawclipped.
// With inarray >= 0 the attribute is first loaded via DPSOFTRAST_Array_Load;
// otherwise post_array4f[outarray] is used as is.
// Returns the array, or NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
        return data;
#else
        return NULL;
#endif
}
1971 #endif
1972
// Transforms a vertex attribute array in place by inmatrix16f and projects it to
// screen space: screen coordinates go to dpsoftrast.screencoord4f, the row range
// to dpsoftrast.drawstarty/drawendy and the clip mask to dpsoftrast.drawclipped.
// With inarray >= 0 the attribute is first loaded via DPSOFTRAST_Array_Load;
// otherwise post_array4f[outarray] is used as is.
// Returns the transformed array, or NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
        return data;
#else
        return NULL;
#endif
}
1983
1984 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1985 {
1986         int x;
1987         int startx = span->startx;
1988         int endx = span->endx;
1989         float wslope = triangle->w[0];
1990         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1991         float endz = 1.0f / (w + wslope * startx);
1992         if (triangle->w[0] == 0)
1993         {
1994                 // LordHavoc: fast flat polygons (HUD/menu)
1995                 for (x = startx;x < endx;x++)
1996                         zf[x] = endz;
1997                 return;
1998         }
1999         for (x = startx;x < endx;)
2000         {
2001                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2002                 float z = endz, dz;
2003                 if (nextsub >= endx) nextsub = endsub = endx-1;
2004                 endz = 1.0f / (w + wslope * nextsub);
2005                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2006                 for (; x <= endsub; x++, z += dz)
2007                         zf[x] = z;
2008         }
2009 }
2010
2011 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2012 {
2013         int x;
2014         int startx = span->startx;
2015         int endx = span->endx;
2016         int d[4];
2017         float a, b;
2018         unsigned char * RESTRICT pixelmask = span->pixelmask;
2019         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2020         if (!pixel)
2021                 return;
2022         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2023         // handle alphatest now (this affects depth writes too)
2024         if (thread->alphatest)
2025                 for (x = startx;x < endx;x++)
2026                         if (in4f[x*4+3] < 0.5f)
2027                                 pixelmask[x] = false;
2028         // FIXME: this does not handle bigendian
2029         switch(thread->fb_blendmode)
2030         {
2031         case DPSOFTRAST_BLENDMODE_OPAQUE:
2032                 for (x = startx;x < endx;x++)
2033                 {
2034                         if (!pixelmask[x])
2035                                 continue;
2036                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2037                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2038                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2039                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2040                         pixel[x*4+0] = d[0];
2041                         pixel[x*4+1] = d[1];
2042                         pixel[x*4+2] = d[2];
2043                         pixel[x*4+3] = d[3];
2044                 }
2045                 break;
2046         case DPSOFTRAST_BLENDMODE_ALPHA:
2047                 for (x = startx;x < endx;x++)
2048                 {
2049                         if (!pixelmask[x])
2050                                 continue;
2051                         a = in4f[x*4+3] * 255.0f;
2052                         b = 1.0f - in4f[x*4+3];
2053                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2054                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2055                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2056                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2057                         pixel[x*4+0] = d[0];
2058                         pixel[x*4+1] = d[1];
2059                         pixel[x*4+2] = d[2];
2060                         pixel[x*4+3] = d[3];
2061                 }
2062                 break;
2063         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (!pixelmask[x])
2067                                 continue;
2068                         a = in4f[x*4+3] * 255.0f;
2069                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2070                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2071                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2072                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2073                         pixel[x*4+0] = d[0];
2074                         pixel[x*4+1] = d[1];
2075                         pixel[x*4+2] = d[2];
2076                         pixel[x*4+3] = d[3];
2077                 }
2078                 break;
2079         case DPSOFTRAST_BLENDMODE_ADD:
2080                 for (x = startx;x < endx;x++)
2081                 {
2082                         if (!pixelmask[x])
2083                                 continue;
2084                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2085                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2086                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2087                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2088                         pixel[x*4+0] = d[0];
2089                         pixel[x*4+1] = d[1];
2090                         pixel[x*4+2] = d[2];
2091                         pixel[x*4+3] = d[3];
2092                 }
2093                 break;
2094         case DPSOFTRAST_BLENDMODE_INVMOD:
2095                 for (x = startx;x < endx;x++)
2096                 {
2097                         if (!pixelmask[x])
2098                                 continue;
2099                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2100                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2101                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2102                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2103                         pixel[x*4+0] = d[0];
2104                         pixel[x*4+1] = d[1];
2105                         pixel[x*4+2] = d[2];
2106                         pixel[x*4+3] = d[3];
2107                 }
2108                 break;
2109         case DPSOFTRAST_BLENDMODE_MUL:
2110                 for (x = startx;x < endx;x++)
2111                 {
2112                         if (!pixelmask[x])
2113                                 continue;
2114                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2115                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2116                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2117                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2118                         pixel[x*4+0] = d[0];
2119                         pixel[x*4+1] = d[1];
2120                         pixel[x*4+2] = d[2];
2121                         pixel[x*4+3] = d[3];
2122                 }
2123                 break;
2124         case DPSOFTRAST_BLENDMODE_MUL2:
2125                 for (x = startx;x < endx;x++)
2126                 {
2127                         if (!pixelmask[x])
2128                                 continue;
2129                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2130                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2131                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2132                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2133                         pixel[x*4+0] = d[0];
2134                         pixel[x*4+1] = d[1];
2135                         pixel[x*4+2] = d[2];
2136                         pixel[x*4+3] = d[3];
2137                 }
2138                 break;
2139         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2140                 for (x = startx;x < endx;x++)
2141                 {
2142                         if (!pixelmask[x])
2143                                 continue;
2144                         a = in4f[x*4+3] * -255.0f;
2145                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2146                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2147                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2148                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2149                         pixel[x*4+0] = d[0];
2150                         pixel[x*4+1] = d[1];
2151                         pixel[x*4+2] = d[2];
2152                         pixel[x*4+3] = d[3];
2153                 }
2154                 break;
2155         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2156                 for (x = startx;x < endx;x++)
2157                 {
2158                         if (!pixelmask[x])
2159                                 continue;
2160                         a = 255.0f;
2161                         b = 1.0f - in4f[x*4+3];
2162                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2163                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2164                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2165                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2166                         pixel[x*4+0] = d[0];
2167                         pixel[x*4+1] = d[1];
2168                         pixel[x*4+2] = d[2];
2169                         pixel[x*4+3] = d[3];
2170                 }
2171                 break;
2172         case DPSOFTRAST_BLENDMODE_INVADD:
2173                 for (x = startx;x < endx;x++)
2174                 {
2175                         if (!pixelmask[x])
2176                                 continue;
2177                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2178                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2179                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2180                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2181                         pixel[x*4+0] = d[0];
2182                         pixel[x*4+1] = d[1];
2183                         pixel[x*4+2] = d[2];
2184                         pixel[x*4+3] = d[3];
2185                 }
2186                 break;
2187         }
2188 }
2189
2190 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2191 {
2192 #ifdef SSE_POSSIBLE
2193         int x;
2194         int startx = span->startx;
2195         int endx = span->endx;
2196         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2197         unsigned char * RESTRICT pixelmask = span->pixelmask;
2198         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2199         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2200         if (!pixel)
2201                 return;
2202         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2203         pixeli += span->y * dpsoftrast.fb_width + span->x;
2204         // handle alphatest now (this affects depth writes too)
2205         if (thread->alphatest)
2206                 for (x = startx;x < endx;x++)
2207                         if (in4ub[x*4+3] < 0.5f)
2208                                 pixelmask[x] = false;
2209         // FIXME: this does not handle bigendian
2210         switch(thread->fb_blendmode)
2211         {
2212         case DPSOFTRAST_BLENDMODE_OPAQUE:
2213                 for (x = startx;x + 4 <= endx;)
2214                 {
2215                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2216                         {
2217                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2218                                 x += 4;
2219                         }
2220                         else
2221                         {
2222                                 if (pixelmask[x])
2223                                         pixeli[x] = ini[x];
2224                                 x++;
2225                         }
2226                 }
2227                 for (;x < endx;x++)
2228                         if (pixelmask[x])
2229                                 pixeli[x] = ini[x];
2230                 break;
2231         case DPSOFTRAST_BLENDMODE_ALPHA:
2232         #define FINISHBLEND(blend2, blend1) \
2233                 for (x = startx;x + 1 < endx;x += 2) \
2234                 { \
2235                         __m128i src, dst; \
2236                         switch (*(const unsigned short*)&pixelmask[x]) \
2237                         { \
2238                         case 0x0101: \
2239                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2240                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2241                                 blend2; \
2242                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2243                                 continue; \
2244                         case 0x0100: \
2245                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2246                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2247                                 blend1; \
2248                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2249                                 continue; \
2250                         case 0x0001: \
2251                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2252                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2253                                 blend1; \
2254                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2255                                 continue; \
2256                         } \
2257                         break; \
2258                 } \
2259                 for(;x < endx; x++) \
2260                 { \
2261                         __m128i src, dst; \
2262                         if (!pixelmask[x]) \
2263                                 continue; \
2264                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2265                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2266                         blend1; \
2267                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2268                 }
2269
2270                 FINISHBLEND({
2271                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2272                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2273                 }, {
2274                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2275                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2276                 });
2277                 break;
2278         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2279                 FINISHBLEND({
2280                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2281                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2282                 }, {
2283                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2284                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2285                 });
2286                 break;
2287         case DPSOFTRAST_BLENDMODE_ADD:
2288                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2289                 break;
2290         case DPSOFTRAST_BLENDMODE_INVMOD:
2291                 FINISHBLEND({
2292                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2293                 }, {
2294                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2295                 });
2296                 break;
2297         case DPSOFTRAST_BLENDMODE_MUL:
2298                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2299                 break;
2300         case DPSOFTRAST_BLENDMODE_MUL2:
2301                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2302                 break;
2303         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2304                 FINISHBLEND({
2305                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2306                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2307                 }, {
2308                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2309                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2310                 });
2311                 break;
2312         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2313                 FINISHBLEND({
2314                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2315                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2316                 }, {
2317                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2318                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2319                 });
2320                 break;
2321         case DPSOFTRAST_BLENDMODE_INVADD:
2322                 FINISHBLEND({
2323                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2324                 }, {
2325                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2326                 });
2327                 break;
2328         }
2329 #endif
2330 }
2331
2332 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2333 {
2334         int x;
2335         int startx = span->startx;
2336         int endx = span->endx;
2337         int flags;
2338         float c[4];
2339         float data[4];
2340         float slope[4];
2341         float tc[2], endtc[2];
2342         float tcscale[2];
2343         unsigned int tci[2];
2344         unsigned int tci1[2];
2345         unsigned int tcimin[2];
2346         unsigned int tcimax[2];
2347         int tciwrapmask[2];
2348         int tciwidth;
2349         int filter;
2350         int mip;
2351         const unsigned char * RESTRICT pixelbase;
2352         const unsigned char * RESTRICT pixel[4];
2353         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2354         // if no texture is bound, just fill it with white
2355         if (!texture)
2356         {
2357                 for (x = startx;x < endx;x++)
2358                 {
2359                         out4f[x*4+0] = 1.0f;
2360                         out4f[x*4+1] = 1.0f;
2361                         out4f[x*4+2] = 1.0f;
2362                         out4f[x*4+3] = 1.0f;
2363                 }
2364                 return;
2365         }
2366         mip = triangle->mip[texunitindex];
2367         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2368         // if this mipmap of the texture is 1 pixel, just fill it with that color
2369         if (texture->mipmap[mip][1] == 4)
2370         {
2371                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2372                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2373                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2374                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2375                 for (x = startx;x < endx;x++)
2376                 {
2377                         out4f[x*4+0] = c[0];
2378                         out4f[x*4+1] = c[1];
2379                         out4f[x*4+2] = c[2];
2380                         out4f[x*4+3] = c[3];
2381                 }
2382                 return;
2383         }
2384         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2385         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2386         flags = texture->flags;
2387         tcscale[0] = texture->mipmap[mip][2];
2388         tcscale[1] = texture->mipmap[mip][3];
2389         tciwidth = texture->mipmap[mip][2];
2390         tcimin[0] = 0;
2391         tcimin[1] = 0;
2392         tcimax[0] = texture->mipmap[mip][2]-1;
2393         tcimax[1] = texture->mipmap[mip][3]-1;
2394         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2395         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2396         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2397         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2398         if (filter)
2399         {
2400                 endtc[0] -= 0.5f;
2401                 endtc[1] -= 0.5f;
2402         }
2403         for (x = startx;x < endx;)
2404         {
2405                 unsigned int subtc[2];
2406                 unsigned int substep[2];
2407                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2408                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2409                 if (nextsub >= endx)
2410                 {
2411                         nextsub = endsub = endx-1;      
2412                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2413                 }
2414                 tc[0] = endtc[0];
2415                 tc[1] = endtc[1];
2416                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2417                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2418                 if (filter)
2419                 {
2420                         endtc[0] -= 0.5f;
2421                         endtc[1] -= 0.5f;
2422                 }
2423                 substep[0] = (endtc[0] - tc[0]) * subscale;
2424                 substep[1] = (endtc[1] - tc[1]) * subscale;
2425                 subtc[0] = tc[0] * (1<<12);
2426                 subtc[1] = tc[1] * (1<<12);
2427                 if (filter)
2428                 {
2429                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2430                         {
2431                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2432                                 {
2433                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2434                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2435                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2436                                         tci[0] = subtc[0]>>12;
2437                                         tci[1] = subtc[1]>>12;
2438                                         tci1[0] = tci[0] + 1;
2439                                         tci1[1] = tci[1] + 1;
2440                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2441                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2442                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2443                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2444                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2445                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2446                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2447                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2448                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2449                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2450                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2451                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2452                                         out4f[x*4+0] = c[0];
2453                                         out4f[x*4+1] = c[1];
2454                                         out4f[x*4+2] = c[2];
2455                                         out4f[x*4+3] = c[3];
2456                                 }
2457                         }
2458                         else
2459                         {
2460                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2461                                 {
2462                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2463                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2464                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2465                                         tci[0] = subtc[0]>>12;
2466                                         tci[1] = subtc[1]>>12;
2467                                         tci1[0] = tci[0] + 1;
2468                                         tci1[1] = tci[1] + 1;
2469                                         tci[0] &= tciwrapmask[0];
2470                                         tci[1] &= tciwrapmask[1];
2471                                         tci1[0] &= tciwrapmask[0];
2472                                         tci1[1] &= tciwrapmask[1];
2473                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2474                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2475                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2476                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2477                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2478                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2479                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2480                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2481                                         out4f[x*4+0] = c[0];
2482                                         out4f[x*4+1] = c[1];
2483                                         out4f[x*4+2] = c[2];
2484                                         out4f[x*4+3] = c[3];
2485                                 }
2486                         }
2487                 }
2488                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2489                 {
2490                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2491                         {
2492                                 tci[0] = subtc[0]>>12;
2493                                 tci[1] = subtc[1]>>12;
2494                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2495                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2496                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2497                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2498                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2499                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2500                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2501                                 out4f[x*4+0] = c[0];
2502                                 out4f[x*4+1] = c[1];
2503                                 out4f[x*4+2] = c[2];
2504                                 out4f[x*4+3] = c[3];
2505                         }
2506                 }
2507                 else
2508                 {
2509                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2510                         {
2511                                 tci[0] = subtc[0]>>12;
2512                                 tci[1] = subtc[1]>>12;
2513                                 tci[0] &= tciwrapmask[0];
2514                                 tci[1] &= tciwrapmask[1];
2515                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2516                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2517                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2518                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2519                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2520                                 out4f[x*4+0] = c[0];
2521                                 out4f[x*4+1] = c[1];
2522                                 out4f[x*4+2] = c[2];
2523                                 out4f[x*4+3] = c[3];
2524                         }
2525                 }
2526         }
2527 }
2528
2529 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2530 {
2531 #ifdef SSE_POSSIBLE
2532         int x;
2533         int startx = span->startx;
2534         int endx = span->endx;
2535         int flags;
2536         __m128 data, slope, tcscale;
2537         __m128i tcsize, tcmask, tcoffset, tcmax;
2538         __m128 tc, endtc;
2539         __m128i subtc, substep, endsubtc;
2540         int filter;
2541         int mip;
2542         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2543         const unsigned char * RESTRICT pixelbase;
2544         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2545         // if no texture is bound, just fill it with white
2546         if (!texture)
2547         {
2548                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2549                 return;
2550         }
2551         mip = triangle->mip[texunitindex];
2552         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2553         // if this mipmap of the texture is 1 pixel, just fill it with that color
2554         if (texture->mipmap[mip][1] == 4)
2555         {
2556                 unsigned int k = *((const unsigned int *)pixelbase);
2557                 for (x = startx;x < endx;x++)
2558                         outi[x] = k;
2559                 return;
2560         }
2561         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2562         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2563         flags = texture->flags;
2564         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2565         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2566         tcscale = _mm_cvtepi32_ps(tcsize);
2567         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2568         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2569         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2570         if (filter)
2571                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2572         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2573         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2574         tcmax = _mm_packs_epi32(tcmask, tcmask);
2575         for (x = startx;x < endx;)
2576         {
2577                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2578                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2579                 if (nextsub >= endx)
2580                 {
2581                         nextsub = endsub = endx-1;
2582                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2583                 }       
2584                 tc = endtc;
2585                 subtc = endsubtc;
2586                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2587                 if (filter)
2588                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2589                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2590                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2591                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2592                 substep = _mm_slli_epi32(substep, 1);
2593                 if (filter)
2594                 {
2595                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2596                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2597                         {
2598                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2599                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2600                                 {
2601                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2602                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2603                                         tci = _mm_madd_epi16(tci, tcoffset);
2604                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2605                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2606                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2607                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2608                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2609                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2610                                         fracm = _mm_srli_epi16(subtc, 1);
2611                                         pix1 = _mm_add_epi16(pix1,
2612                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2613                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2614                                         pix3 = _mm_add_epi16(pix3,
2615                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2616                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2617                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2618                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2619                                         pix2 = _mm_add_epi16(pix2,
2620                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2621                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2622                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2623                                 }
2624                                 if (x <= endsub)
2625                                 {
2626                                         const unsigned char * RESTRICT ptr1;
2627                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2628                                         tci = _mm_madd_epi16(tci, tcoffset);
2629                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2630                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2631                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2632                                         fracm = _mm_srli_epi16(subtc, 1);
2633                                         pix1 = _mm_add_epi16(pix1,
2634                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2635                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2636                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2637                                         pix1 = _mm_add_epi16(pix1,
2638                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2639                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2640                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2641                                         x++;
2642                                 }
2643                         }
2644                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2645                         {
2646                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2647                                 {
2648                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2649                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2650                                         tci = _mm_madd_epi16(tci, tcoffset);
2651                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2652                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2653                                                                                         _mm_setzero_si128());
2654                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2655                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2656                                                                                         _mm_setzero_si128());
2657                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2658                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659                                         tci = _mm_madd_epi16(tci, tcoffset);
2660                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662                                                                                         _mm_setzero_si128());
2663                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665                                                                                         _mm_setzero_si128());
2666                                         fracm = _mm_srli_epi16(subtc, 1);
2667                                         pix1 = _mm_add_epi16(pix1,
2668                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2670                                         pix3 = _mm_add_epi16(pix3,
2671                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2672                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2673                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2674                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2675                                         pix2 = _mm_add_epi16(pix2,
2676                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2677                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2678                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2679                                 }
2680                                 if (x <= endsub)
2681                                 {
2682                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2683                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2684                                         tci = _mm_madd_epi16(tci, tcoffset);
2685                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2686                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2687                                                                                         _mm_setzero_si128());
2688                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2689                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2690                                                                                         _mm_setzero_si128());
2691                                         fracm = _mm_srli_epi16(subtc, 1);
2692                                         pix1 = _mm_add_epi16(pix1,
2693                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2694                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2695                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2696                                         pix1 = _mm_add_epi16(pix1,
2697                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2698                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2699                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2700                                         x++;
2701                                 }
2702                         }
2703                         else
2704                         {
2705                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2706                                 {
2707                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2708                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2709                                         tci = _mm_madd_epi16(tci, tcoffset);
2710                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2711                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2712                                                                                         _mm_setzero_si128());
2713                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2714                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2715                                                                                         _mm_setzero_si128());
2716                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2717                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2718                                         tci = _mm_madd_epi16(tci, tcoffset);
2719                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2720                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2721                                                                                         _mm_setzero_si128());
2722                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2723                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2724                                                                                         _mm_setzero_si128());
2725                                         fracm = _mm_srli_epi16(subtc, 1);
2726                                         pix1 = _mm_add_epi16(pix1,
2727                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2729                                         pix3 = _mm_add_epi16(pix3,
2730                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2731                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2732                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2733                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2734                                         pix2 = _mm_add_epi16(pix2,
2735                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2736                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2737                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2738                                 }
2739                                 if (x <= endsub)
2740                                 {
2741                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2742                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2743                                         tci = _mm_madd_epi16(tci, tcoffset);
2744                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2745                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2746                                                                                         _mm_setzero_si128());
2747                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2748                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2749                                                                                         _mm_setzero_si128());
2750                                         fracm = _mm_srli_epi16(subtc, 1);
2751                                         pix1 = _mm_add_epi16(pix1,
2752                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2753                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2754                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2755                                         pix1 = _mm_add_epi16(pix1,
2756                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2757                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2758                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2759                                         x++;
2760                                 }
2761                         }
2762                 }
2763                 else
2764                 {
2765                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2766                         {
2767                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2768                                 {
2769                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2770                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2771                                         tci = _mm_madd_epi16(tci, tcoffset);
2772                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2773                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2774                                 }
2775                                 if (x <= endsub)
2776                                 {
2777                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2778                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2779                                         tci = _mm_madd_epi16(tci, tcoffset);
2780                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2781                                         x++;
2782                                 }
2783                         }
2784                         else
2785                         {
2786                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2787                                 {
2788                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2789                                         tci = _mm_and_si128(tci, tcmax); 
2790                                         tci = _mm_madd_epi16(tci, tcoffset);
2791                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2792                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2793                                 }
2794                                 if (x <= endsub)
2795                                 {
2796                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2797                                         tci = _mm_and_si128(tci, tcmax); 
2798                                         tci = _mm_madd_epi16(tci, tcoffset);
2799                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2800                                         x++;
2801                                 }
2802                         }
2803                 }
2804         }
2805 #endif
2806 }
2807
2808 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2809 {
2810         // TODO: IMPLEMENT
2811         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2812 }
2813
// Shadowmap sampling stub: the input vector is ignored and the result is
// always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2819
2820 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2821 {
2822         int x;
2823         int startx = span->startx;
2824         int endx = span->endx;
2825         float c[4];
2826         float data[4];
2827         float slope[4];
2828         float z;
2829         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2830         for (x = startx;x < endx;x++)
2831         {
2832                 z = zf[x];
2833                 c[0] = (data[0] + slope[0]*x) * z;
2834                 c[1] = (data[1] + slope[1]*x) * z;
2835                 c[2] = (data[2] + slope[2]*x) * z;
2836                 c[3] = (data[3] + slope[3]*x) * z;
2837                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2838                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2839                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2840                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2841         }
2842 }
2843
2844 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2845 {
2846         int x;
2847         int startx = span->startx;
2848         int endx = span->endx;
2849         float c[4];
2850         float data[4];
2851         float slope[4];
2852         float z;
2853         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2854         for (x = startx;x < endx;x++)
2855         {
2856                 z = zf[x];
2857                 c[0] = (data[0] + slope[0]*x) * z;
2858                 c[1] = (data[1] + slope[1]*x) * z;
2859                 c[2] = (data[2] + slope[2]*x) * z;
2860                 c[3] = (data[3] + slope[3]*x) * z;
2861                 out4f[x*4+0] = c[0];
2862                 out4f[x*4+1] = c[1];
2863                 out4f[x*4+2] = c[2];
2864                 out4f[x*4+3] = c[3];
2865         }
2866 }
2867
2868 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2869 {
2870         int x, startx = span->startx, endx = span->endx;
2871         float c[4], localcolor[4];
2872         localcolor[0] = subcolor[0];
2873         localcolor[1] = subcolor[1];
2874         localcolor[2] = subcolor[2];
2875         localcolor[3] = subcolor[3];
2876         for (x = startx;x < endx;x++)
2877         {
2878                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2879                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2880                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2881                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2882                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2883                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2884                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2885                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2886         }
2887 }
2888
2889 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2890 {
2891         int x, startx = span->startx, endx = span->endx;
2892         for (x = startx;x < endx;x++)
2893         {
2894                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2895                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2896                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2897                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2898         }
2899 }
2900
2901 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2902 {
2903         int x, startx = span->startx, endx = span->endx;
2904         for (x = startx;x < endx;x++)
2905         {
2906                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2907                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2908                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2909                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2910         }
2911 }
2912
2913 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2914 {
2915         int x, startx = span->startx, endx = span->endx;
2916         float a, b;
2917         for (x = startx;x < endx;x++)
2918         {
2919                 a = 1.0f - inb4f[x*4+3];
2920                 b = inb4f[x*4+3];
2921                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2922                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2923                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2924                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2925         }
2926 }
2927
2928 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2929 {
2930         int x, startx = span->startx, endx = span->endx;
2931         float localcolor[4], ilerp, lerp;
2932         localcolor[0] = color[0];
2933         localcolor[1] = color[1];
2934         localcolor[2] = color[2];
2935         localcolor[3] = color[3];
2936         ilerp = 1.0f - localcolor[3];
2937         lerp = localcolor[3];
2938         for (x = startx;x < endx;x++)
2939         {
2940                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2941                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2942                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2943                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2944         }
2945 }
2946
2947
2948
2949 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2950 {
2951 #ifdef SSE_POSSIBLE
2952         int x;
2953         int startx = span->startx;
2954         int endx = span->endx;
2955         __m128 data, slope;
2956         __m128 mod, endmod;
2957         __m128i submod, substep, endsubmod;
2958         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2959         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2960         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2961         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2962         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2963         for (x = startx; x < endx;)
2964         {
2965                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2966                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2967                 if (nextsub >= endx)
2968                 {
2969                         nextsub = endsub = endx-1;
2970                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2971                 }
2972                 mod = endmod;
2973                 submod = endsubmod;
2974                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2975                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2976                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2977                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2978                 substep = _mm_packs_epi32(substep, substep);
2979                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2980                 {
2981                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2982                         pix = _mm_mulhi_epu16(pix, submod);
2983                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2984                 }
2985                 if (x <= endsub)
2986                 {
2987                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2988                         pix = _mm_mulhi_epu16(pix, submod);
2989                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2990                         x++;
2991                 }
2992         }
2993 #endif
2994 }
2995
2996 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2997 {
2998 #ifdef SSE_POSSIBLE
2999         int x;
3000         int startx = span->startx;
3001         int endx = span->endx;
3002         __m128 data, slope;
3003         __m128 mod, endmod;
3004         __m128i submod, substep, endsubmod;
3005         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3006         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3007         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3008         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3009         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3010         for (x = startx; x < endx;)
3011         {
3012                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3013                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3014                 if (nextsub >= endx)
3015                 {
3016                         nextsub = endsub = endx-1;
3017                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3018                 }
3019                 mod = endmod;
3020                 submod = endsubmod;
3021                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3022                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3023                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3024                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3025                 substep = _mm_packs_epi32(substep, substep);
3026                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3027                 {
3028                         __m128i pix = _mm_srai_epi16(submod, 4);
3029                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3030                 }
3031                 if (x <= endsub)
3032                 {
3033                         __m128i pix = _mm_srai_epi16(submod, 4);
3034                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3035                         x++;
3036                 }
3037         }
3038 #endif
3039 }
3040
3041 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3042 {
3043 #ifdef SSE_POSSIBLE
3044         int x, startx = span->startx, endx = span->endx;
3045         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3046         localcolor = _mm_packs_epi32(localcolor, localcolor);
3047         for (x = startx;x+2 <= endx;x+=2)
3048         {
3049                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3050                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3051                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3052                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3053         }
3054         if (x < endx)
3055         {
3056                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3057                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3058                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3059                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3060         }
3061 #endif
3062 }
3063
3064 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3065 {
3066 #ifdef SSE_POSSIBLE
3067         int x, startx = span->startx, endx = span->endx;
3068         for (x = startx;x+2 <= endx;x+=2)
3069         {
3070                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3071                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3072                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3073                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3074         }
3075         if (x < endx)
3076         {
3077                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3078                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3079                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3080                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3081         }
3082 #endif
3083 }
3084
3085 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3086 {
3087 #ifdef SSE_POSSIBLE
3088         int x, startx = span->startx, endx = span->endx;
3089         for (x = startx;x+2 <= endx;x+=2)
3090         {
3091                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3092                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3093                 pix1 = _mm_add_epi16(pix1, pix2);
3094                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3095         }
3096         if (x < endx)
3097         {
3098                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3099                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3100                 pix1 = _mm_add_epi16(pix1, pix2);
3101                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3102         }
3103 #endif
3104 }
3105
3106 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3107 {
3108 #ifdef SSE_POSSIBLE
3109         int x, startx = span->startx, endx = span->endx;
3110         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3111         tint = _mm_packs_epi32(tint, tint);
3112         for (x = startx;x+2 <= endx;x+=2)
3113         {
3114                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3115                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3116                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3117                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3118         }
3119         if (x < endx)
3120         {
3121                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3122                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3123                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3124                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3125         }
3126 #endif
3127 }
3128
3129 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3130 {
3131 #ifdef SSE_POSSIBLE
3132         int x, startx = span->startx, endx = span->endx;
3133         for (x = startx;x+2 <= endx;x+=2)
3134         {
3135                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3136                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3137                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3138                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3139                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3140         }
3141         if (x < endx)
3142         {
3143                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3144                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3145                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3146                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3147                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3148         }
3149 #endif
3150 }
3151
3152 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3153 {
3154 #ifdef SSE_POSSIBLE
3155         int x, startx = span->startx, endx = span->endx;
3156         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3157         localcolor = _mm_packs_epi32(localcolor, localcolor);
3158         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3159         for (x = startx;x+2 <= endx;x+=2)
3160         {
3161                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3162                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3163                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3164         }
3165         if (x < endx)
3166         {
3167                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3168                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3169                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3170         }
3171 #endif
3172 }
3173
3174
3175
3176 void DPSOFTRAST_VertexShader_Generic(void)
3177 {
3178         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3179         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3180         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3181         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3182                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3183 }
3184
3185 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3186 {
3187         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3188         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3189         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3190         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3191         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3192         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3193         {
3194                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3195                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3196                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3197                 {
3198                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3199                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3200                         {
3201                                 // multiply
3202                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3203                         }
3204                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3205                         {
3206                                 // add
3207                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3208                         }
3209                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3210                         {
3211                                 // alphablend
3212                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3213                         }
3214                 }
3215         }
3216         else
3217                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3218         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3219 }
3220
3221
3222
3223 void DPSOFTRAST_VertexShader_PostProcess(void)
3224 {
3225         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3226         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3227         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3228 }
3229
3230 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3231 {
3232         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3233         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3234         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3235         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3236         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3237         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3238         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3239         {
3240                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3241                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3242         }
3243         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3244         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3245         {
3246                 // TODO: implement saturation
3247         }
3248         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3249         {
3250                 // TODO: implement gammaramps
3251         }
3252         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3253 }
3254
3255
3256
3257 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3258 {
3259         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3260 }
3261
3262 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3263 {
3264         // this is never called (because colormask is off when this shader is used)
3265         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3266         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3267         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3268         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3269         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3270 }
3271
3272
3273
3274 void DPSOFTRAST_VertexShader_FlatColor(void)
3275 {
3276         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3277         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3278 }
3279
3280 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3281 {
3282 #ifdef SSE_POSSIBLE
3283         unsigned char * RESTRICT pixelmask = span->pixelmask;
3284         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3285         int x, startx = span->startx, endx = span->endx;
3286         __m128i Color_Ambientm;
3287         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3288         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3289         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3290         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3291         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3292         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3293                 pixel = buffer_FragColorbgra8;
3294         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3295         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3296         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3297         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3298         for (x = startx;x < endx;x++)
3299         {
3300                 __m128i color, pix;
3301                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3302                 {
3303                         __m128i pix2;
3304                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3305                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3306                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3307                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3308                         x += 3;
3309                         continue;
3310                 }
3311                 if (!pixelmask[x])
3312                         continue;
3313                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3314                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3315                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3316         }
3317         if (pixel == buffer_FragColorbgra8)
3318                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3319 #endif
3320 }
3321
3322
3323
3324 void DPSOFTRAST_VertexShader_VertexColor(void)
3325 {
3326         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3327         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3328         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3329 }
3330
3331 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3332 {
3333 #ifdef SSE_POSSIBLE
3334         unsigned char * RESTRICT pixelmask = span->pixelmask;
3335         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3336         int x, startx = span->startx, endx = span->endx;
3337         __m128i Color_Ambientm, Color_Diffusem;
3338         __m128 data, slope;
3339         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3340         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3341         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3342         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3343         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3344         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3345         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3346                 pixel = buffer_FragColorbgra8;
3347         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3348         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3349         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3350         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3351         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3352         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3353         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3354         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3355         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3356         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3357         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3358         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3359         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3360         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3361         {
3362                 __m128i color, mod, pix;
3363                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3364                 {
3365                         __m128i pix2, mod2;
3366                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3367                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3368                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3369                         data = _mm_add_ps(data, slope);
3370                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3371                         data = _mm_add_ps(data, slope);
3372                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3373                         data = _mm_add_ps(data, slope);
3374                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3375                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3376                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3377                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3378                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3379                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3380                         x += 3;
3381                         continue;
3382                 }
3383                 if (!pixelmask[x])
3384                         continue;
3385                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3386                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3387                 mod = _mm_packs_epi32(mod, mod);
3388                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3389                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3390         }
3391         if (pixel == buffer_FragColorbgra8)
3392                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3393 #endif
3394 }
3395
3396
3397
3398 void DPSOFTRAST_VertexShader_Lightmap(void)
3399 {
3400         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3401         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3402         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3403 }
3404
3405 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3406 {
3407 #ifdef SSE_POSSIBLE
3408         unsigned char * RESTRICT pixelmask = span->pixelmask;
3409         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3410         int x, startx = span->startx, endx = span->endx;
3411         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3412         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3413         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3414         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3415         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3416         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3417         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3418         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3419         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3420         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3421                 pixel = buffer_FragColorbgra8;
3422         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3423         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3424         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3425         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3426         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3427         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3428         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3429         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3430         {
3431                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3432                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3433                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3434                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3435                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3436                 for (x = startx;x < endx;x++)
3437                 {
3438                         __m128i color, lightmap, glow, pix;
3439                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3440                         {
3441                                 __m128i pix2;
3442                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3443                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3444                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3445                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3446                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3447                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3448                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3449                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3450                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3451                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3452                                 x += 3;
3453                                 continue;
3454                         }
3455                         if (!pixelmask[x])
3456                                 continue;
3457                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3458                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3459                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3460                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3461                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3462                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3463                 }
3464         }
3465         else
3466         {
3467                 for (x = startx;x < endx;x++)
3468                 {
3469                         __m128i color, lightmap, pix;
3470                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3471                         {
3472                                 __m128i pix2;
3473                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3474                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3475                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3476                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3477                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3478                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3479                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3480                                 x += 3;
3481                                 continue;
3482                         }
3483                         if (!pixelmask[x]) 
3484                                 continue;
3485                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3486                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3487                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3488                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3489                 }
3490         }
3491         if (pixel == buffer_FragColorbgra8)
3492                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3493 #endif
3494 }
3495
3496
3497 void DPSOFTRAST_VertexShader_LightDirection(void);
3498 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3499
// FakeLight vertex stage: has no vertex work of its own, it simply runs
// the LightDirection vertex shader.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3504
3505 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3506 {
3507         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3508 }
3509
3510
3511
3512 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3513 {
3514         DPSOFTRAST_VertexShader_LightDirection();
3515         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3516 }
3517
3518 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3519 {
3520         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3521 }
3522
3523
3524
3525 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3526 {
3527         DPSOFTRAST_VertexShader_LightDirection();
3528         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3529 }
3530
3531 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3532 {
3533         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3534 }
3535
3536
3537
3538 void DPSOFTRAST_VertexShader_LightDirection(void)
3539 {
3540         int i;
3541         int numvertices = dpsoftrast.numvertices;
3542         float LightDir[4];
3543         float LightVector[4];
3544         float EyePosition[4];
3545         float EyeVectorModelSpace[4];
3546         float EyeVector[4];
3547         float position[4];
3548         float svector[4];
3549         float tvector[4];
3550         float normal[4];
3551         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3552         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3553         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3554         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3555         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3556         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3557         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3558         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3559         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3560         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3561         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3562         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3563         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3564         for (i = 0;i < numvertices;i++)
3565         {
3566                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3567                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3568                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3569                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3570                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3571                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3572                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3573                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3574                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3575                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3576                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3577                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3578                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3579                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3580                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3581                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3582                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3583                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3584                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3585                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3586                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3587                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3588                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3589                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3590                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3591                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3592                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3593                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3594                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3595         }
3596         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3597 }
3598
3599 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3600 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3601 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3602 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3603 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3604 #define DPSOFTRAST_Vector3Normalize(v)\
3605 do\
3606 {\
3607         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3608         if (len)\
3609         {\
3610                 len = 1.0f / len;\
3611                 v[0] *= len;\
3612                 v[1] *= len;\
3613                 v[2] *= len;\
3614         }\
3615 }\
3616 while(0)
3617
3618 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3619 {
3620         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3621         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3622         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3623         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3624         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3625         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3626         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3627         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3628         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3629         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3630         int x, startx = span->startx, endx = span->endx;
3631         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3632         float LightVectordata[4];
3633         float LightVectorslope[4];
3634         float EyeVectordata[4];
3635         float EyeVectorslope[4];
3636         float VectorSdata[4];
3637         float VectorSslope[4];
3638         float VectorTdata[4];
3639         float VectorTslope[4];
3640         float VectorRdata[4];
3641         float VectorRslope[4];
3642         float z;
3643         float diffusetex[4];
3644         float glosstex[4];
3645         float surfacenormal[4];
3646         float lightnormal[4];
3647         float lightnormal_modelspace[4];
3648         float eyenormal[4];
3649         float specularnormal[4];
3650         float diffuse;
3651         float specular;
3652         float SpecularPower;
3653         int d[4];
3654         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3655         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3656         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3657         Color_Glow[3] = 0.0f;
3658         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3659         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3660         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3661         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3662         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3663         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3664         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3665         Color_Pants[3] = 0.0f;
3666         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3667         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3668         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3669         Color_Shirt[3] = 0.0f;
3670         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3671         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3672         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3673         {
3674                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3675                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3676         }
3677         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3678         {
3679                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3680         }
3681         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3682         {
3683                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3684                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3685                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3686                 Color_Diffuse[3] = 0.0f;
3687                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3688                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3689                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3690                 LightColor[3] = 0.0f;
3691                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3692                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3693                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3694                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3695                 Color_Specular[3] = 0.0f;
3696                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3697                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3698                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3699
3700                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3701                 {
3702                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3703                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3704                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3705                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3706                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3707                 }
3708                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3709                 {
3710                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3711                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3712                 }
3713                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3714                 {
3715                         // nothing of this needed
3716                 }
3717                 else
3718                 {
3719                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3720                 }
3721
3722                 for (x = startx;x < endx;x++)
3723                 {
3724                         z = buffer_z[x];
3725                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3726                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3727                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3728                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3729                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3730                         {
3731                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3732                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3733                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3734                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3735                         }
3736                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3737                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3738                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3739                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3740                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3741                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3742                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3743                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3744
3745                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3746                         {
3747                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3748                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3749                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3750                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3751
3752                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3753                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3754                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3755                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3756
3757                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3758                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3759                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3760                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3761
3762                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3763                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3764                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3765                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3766
3767                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3768                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3769
3770                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3771                                 {
3772                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3773                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3774                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3775                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3776                                 }
3777                         }
3778                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3779                         {
3780                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3781                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3782                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3783                                 {
3784                                         float f = 1.0f / 256.0f;
3785                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3786                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3787                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3788                                 }
3789                         }
3790                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3791                         {
3792                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3793                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3794                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3795                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3796
3797                                 LightColor[0] = 1.0;
3798                                 LightColor[1] = 1.0;
3799                                 LightColor[2] = 1.0;
3800                         }
3801                         else
3802                         {
3803                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3804                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3805                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3806                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3807                         }
3808
3809                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3810
3811                         if(thread->shader_exactspecularmath)
3812                         {
3813                                 // reflect lightnormal at surfacenormal, take the negative of that
3814                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3815                                 float f;
3816                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3817                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3818                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3819                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3820
3821                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3822                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3823                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3824                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3825                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3826
3827                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3828                         }
3829                         else
3830                         {
3831                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3832                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3833                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3834                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3835
3836                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3837                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3838                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3839                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3840
3841                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3842                         }
3843
3844                         specular = pow(specular, SpecularPower * glosstex[3]);
3845                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3846                         {
3847                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3848                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3849                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3850                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3851                         }
3852                         else
3853                         {
3854                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3855                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3856                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3857                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3858                         }
3859
3860                         buffer_FragColorbgra8[x*4+0] = d[0];
3861                         buffer_FragColorbgra8[x*4+1] = d[1];
3862                         buffer_FragColorbgra8[x*4+2] = d[2];
3863                         buffer_FragColorbgra8[x*4+3] = d[3];
3864                 }
3865         }
3866         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3867         {
3868                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3869                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3870                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3871                 Color_Diffuse[3] = 0.0f;
3872                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3873                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3874                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3875                 LightColor[3] = 0.0f;
3876                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3877
3878                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3879                 {
3880                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3881                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3882                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3883                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3884                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3885                 }
3886                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3887                 {
3888                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3889                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3890                 }
3891                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3892                 {
3893                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3894                 }
3895                 else
3896                 {
3897                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3898                 }
3899
3900                 for (x = startx;x < endx;x++)
3901                 {
3902                         z = buffer_z[x];
3903                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3904                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3905                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3906                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3907                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3908                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3909                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3910                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3911
3912                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3913                         {
3914                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3915                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3916                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3917                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3918
3919                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3920                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3921                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3922                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3923
3924                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3925                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3926                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3927                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3928
3929                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3930                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3931                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3932                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3933
3934                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3935                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3936
3937                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3938                                 {
3939                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3940                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3941                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3942                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3943                                 }
3944                         }
3945                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3946                         {
3947                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3948                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3949                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3950                                 {
3951                                         float f = 1.0f / 256.0f;
3952                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3953                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3954                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3955                                 }
3956                         }
3957                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3958                         {
3959                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3960                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3961                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3962                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3963
3964                                 LightColor[0] = 1.0;
3965                                 LightColor[1] = 1.0;
3966                                 LightColor[2] = 1.0;
3967                         }
3968                         else
3969                         {
3970                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3971                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3972                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3973                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3974                         }
3975
3976                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3977                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3978                         {
3979                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3980                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3981                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3982                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3983                         }
3984                         else
3985                         {
3986                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3987                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3988                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3989                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3990                         }
3991                         buffer_FragColorbgra8[x*4+0] = d[0];
3992                         buffer_FragColorbgra8[x*4+1] = d[1];
3993                         buffer_FragColorbgra8[x*4+2] = d[2];
3994                         buffer_FragColorbgra8[x*4+3] = d[3];
3995                 }
3996         }
3997         else
3998         {
3999                 for (x = startx;x < endx;x++)
4000                 {
4001                         z = buffer_z[x];
4002                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4003                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4004                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4005                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4006
4007                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4008                         {
4009                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4010                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4011                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4012                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4013                         }
4014                         else
4015                         {
4016                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4017                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4018                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4019                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4020                         }
4021                         buffer_FragColorbgra8[x*4+0] = d[0];
4022                         buffer_FragColorbgra8[x*4+1] = d[1];
4023                         buffer_FragColorbgra8[x*4+2] = d[2];
4024                         buffer_FragColorbgra8[x*4+3] = d[3];
4025                 }
4026         }
4027         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4028 }
4029
4030
4031
4032 void DPSOFTRAST_VertexShader_LightSource(void)
4033 {
4034         int i;
4035         int numvertices = dpsoftrast.numvertices;
4036         float LightPosition[4];
4037         float LightVector[4];
4038         float LightVectorModelSpace[4];
4039         float EyePosition[4];
4040         float EyeVectorModelSpace[4];
4041         float EyeVector[4];
4042         float position[4];
4043         float svector[4];
4044         float tvector[4];
4045         float normal[4];
4046         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4047         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4048         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4049         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4050         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4051         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4052         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4053         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4054         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4055         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4056         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4057         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4058         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4059         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4060         for (i = 0;i < numvertices;i++)
4061         {
4062                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4063                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4064                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4065                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4066                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4067                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4068                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4069                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4070                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4071                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4072                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4073                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4074                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4075                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4076                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4077                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4078                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4079                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4080                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4081                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4082                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4083                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4084                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4085                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4086                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4087                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4088                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4089                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4090                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4091                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4092                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4093                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4094         }
4095         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4096         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4097 }
4098
4099 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4100 {
4101 #ifdef SSE_POSSIBLE
4102         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4103         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4104         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4105         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4106         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4107         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4108         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4109         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4110         int x, startx = span->startx, endx = span->endx;
4111         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4112         float CubeVectordata[4];
4113         float CubeVectorslope[4];
4114         float LightVectordata[4];
4115         float LightVectorslope[4];
4116         float EyeVectordata[4];
4117         float EyeVectorslope[4];
4118         float z;
4119         float diffusetex[4];
4120         float glosstex[4];
4121         float surfacenormal[4];
4122         float lightnormal[4];
4123         float eyenormal[4];
4124         float specularnormal[4];
4125         float diffuse;
4126         float specular;
4127         float SpecularPower;
4128         float CubeVector[4];
4129         float attenuation;
4130         int d[4];
4131         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4132         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4133         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4134         Color_Glow[3] = 0.0f;
4135         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4136         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4137         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4138         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4139         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4140         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4141         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4142         Color_Diffuse[3] = 0.0f;
4143         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4144         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4145         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4146         Color_Specular[3] = 0.0f;
4147         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4148         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4149         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4150         Color_Pants[3] = 0.0f;
4151         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4152         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4153         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4154         Color_Shirt[3] = 0.0f;
4155         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4156         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4157         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4158         LightColor[3] = 0.0f;
4159         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4160         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4161         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4162         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4163         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4164         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4165         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4166         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4167         {
4168                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4169                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4170         }
4171         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4172                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4173         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4174         {
4175                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4176                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4177                 for (x = startx;x < endx;x++)
4178                 {
4179                         z = buffer_z[x];
4180                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4181                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4182                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4183                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4184                         if (attenuation < 0.01f)
4185                                 continue;
4186                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4187                         {
4188                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4189                                 if (attenuation < 0.01f)
4190                                         continue;
4191                         }
4192
4193                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4194                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4195                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4196                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4197                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4198                         {
4199                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4200                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4201                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4202                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4203                         }
4204                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4205                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4206                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4207                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4208                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4209                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4210                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4211                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4212
4213                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4214                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4215                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4216                         DPSOFTRAST_Vector3Normalize(lightnormal);
4217
4218                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4219
4220                         if(thread->shader_exactspecularmath)
4221                         {
4222                                 // reflect lightnormal at surfacenormal, take the negative of that
4223                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4224                                 float f;
4225                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4226                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4227                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4228                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4229
4230                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4231                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4232                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4233                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4234                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4235
4236                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4237                         }
4238                         else
4239                         {
4240                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4241                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4242                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4243                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4244
4245                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4246                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4247                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4248                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4249
4250                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4251                         }
4252                         specular = pow(specular, SpecularPower * glosstex[3]);
4253
4254                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4255                         {
4256                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4257                                 attenuation *= (1.0f / 255.0f);
4258                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4259                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4260                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4261                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4262                         }
4263                         else
4264                         {
4265                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4266                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4267                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4268                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4269                         }
4270                         buffer_FragColorbgra8[x*4+0] = d[0];
4271                         buffer_FragColorbgra8[x*4+1] = d[1];
4272                         buffer_FragColorbgra8[x*4+2] = d[2];
4273                         buffer_FragColorbgra8[x*4+3] = d[3];
4274                 }
4275         }
4276         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4277         {
4278                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4279                 for (x = startx;x < endx;x++)
4280                 {
4281                         z = buffer_z[x];
4282                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4283                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4284                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4285                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4286                         if (attenuation < 0.01f)
4287                                 continue;
4288                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4289                         {
4290                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4291                                 if (attenuation < 0.01f)
4292                                         continue;
4293                         }
4294
4295                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4296                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4297                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4298                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4299                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4300                         {
4301                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4302                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4303                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4304                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4305                         }
4306                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4307                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4308                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4309                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4310
4311                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4312                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4313                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4314                         DPSOFTRAST_Vector3Normalize(lightnormal);
4315
4316                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4317                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4318                         {
4319                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4320                                 attenuation *= (1.0f / 255.0f);
4321                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4322                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4323                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4324                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4325                         }
4326                         else
4327                         {
4328                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4329                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4330                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4331                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4332                         }
4333                         buffer_FragColorbgra8[x*4+0] = d[0];
4334                         buffer_FragColorbgra8[x*4+1] = d[1];
4335                         buffer_FragColorbgra8[x*4+2] = d[2];
4336                         buffer_FragColorbgra8[x*4+3] = d[3];
4337                 }
4338         }
4339         else
4340         {
4341                 for (x = startx;x < endx;x++)
4342                 {
4343                         z = buffer_z[x];
4344                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4345                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4346                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4347                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4348                         if (attenuation < 0.01f)
4349                                 continue;
4350                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4351                         {
4352                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4353                                 if (attenuation < 0.01f)
4354                                         continue;
4355                         }
4356
4357                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4358                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4359                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4360                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4361                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4362                         {
4363                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4364                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4365                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4366                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4367                         }
4368                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4369                         {
4370                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4371                                 attenuation *= (1.0f / 255.0f);
4372                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4373                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4374                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4375                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4376                         }
4377                         else
4378                         {
4379                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4380                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4381                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4382                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4383                         }
4384                         buffer_FragColorbgra8[x*4+0] = d[0];
4385                         buffer_FragColorbgra8[x*4+1] = d[1];
4386                         buffer_FragColorbgra8[x*4+2] = d[2];
4387                         buffer_FragColorbgra8[x*4+3] = d[3];
4388                 }
4389         }
4390         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4391 #endif
4392 }
4393
4394
4395
4396 void DPSOFTRAST_VertexShader_Refraction(void)
4397 {
4398         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4399         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4400         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4401 }
4402
4403 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4404 {
4405         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4406
4407         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4408         float z;
4409         int x, startx = span->startx, endx = span->endx;
4410
4411         // texture reads
4412         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4413         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4414
4415         // varyings
4416         float ModelViewProjectionPositiondata[4];
4417         float ModelViewProjectionPositionslope[4];
4418
4419         // uniforms
4420         float ScreenScaleRefractReflect[2];
4421         float ScreenCenterRefractReflect[2];
4422         float DistortScaleRefractReflect[2];
4423         float RefractColor[4];
4424
4425         const unsigned char * RESTRICT pixelbase;
4426         const unsigned char * RESTRICT pixel[4];
4427         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4428         if(!texture) return;
4429         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4430
4431         // read textures
4432         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4433         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4434
4435         // read varyings
4436         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4437
4438         // read uniforms
4439         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4440         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4441         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4442         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4443         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4444         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4445         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4446         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4447         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4448         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4449
4450         // do stuff
4451         for (x = startx;x < endx;x++)
4452         {
4453                 float SafeScreenTexCoord[2];
4454                 float ScreenTexCoord[2];
4455                 float v[3];
4456                 float iw;
4457                 unsigned char c[4];
4458
4459                 z = buffer_z[x];
4460
4461                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4462                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4463         
4464                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4465                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4466                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4467
4468                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4469                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4470                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4471                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4472                 DPSOFTRAST_Vector3Normalize(v);
4473                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4474                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4475
4476                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4477                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4478                 {
4479                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4480                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4481                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4482                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4483                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4484                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4485                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4486                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4487                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4488                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4489                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4490                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4491                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4492                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4493                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4494                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4495                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4496                 }
4497                 else
4498                 {
4499                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4500                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4501                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4502                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4503                         c[0] = pixel[0][0];
4504                         c[1] = pixel[0][1];
4505                         c[2] = pixel[0][2];
4506                 }
4507
4508                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4509                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4510                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4511                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4512                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4513         }
4514
4515         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4516 }
4517
4518
4519
4520 void DPSOFTRAST_VertexShader_Water(void)
4521 {
4522         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4523 }
4524
4525
4526 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4527 {
4528         // TODO: IMPLEMENT
4529         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4530         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4531         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4532         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4533         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4534 }
4535
4536
4537
4538 void DPSOFTRAST_VertexShader_ShowDepth(void)
4539 {
4540         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4541 }
4542
4543 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4544 {
4545         // TODO: IMPLEMENT
4546         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4547         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4548         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4549         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4550         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4551 }
4552
4553
4554
4555 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4556 {
4557         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4558 }
4559
4560 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4561 {
4562         // TODO: IMPLEMENT
4563         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4564         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4565         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4566         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4567         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4568 }
4569
4570
4571
4572 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4573 {
4574         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4575 }
4576
4577 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4578 {
4579         // TODO: IMPLEMENT
4580         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4581         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4582         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4583         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4584         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4585 }
4586
4587
4588
// Describes one shader mode of the software rasterizer: its vertex and span
// (pixel) entry points plus the vertex arrays and texture units it lists.
// The shader mode table is initialized positionally, so the field order
// here must not change.
4589 typedef struct DPSOFTRAST_ShaderModeInfo_s
4590 {
             // NOTE(review): meaning not visible in this part of the file; every table entry sets it to 2
4591         int lodarrayindex;
             // vertex shader entry point (the table stores DPSOFTRAST_VertexShader_* functions here)
4592         void (*Vertex)(void);
             // pixel shader entry point, called once per span by DPSOFTRAST_Draw_ProcessSpans
4593         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
             // list of DPSOFTRAST_ARRAY_* indices; the table terminates each list with ~0
4594         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
             // list of GL20TU_* texture units; the table terminates each list with ~0
4595         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4596 }
4597 DPSOFTRAST_ShaderModeInfo;
4598
// One entry per shader mode; DPSOFTRAST_Draw_ProcessSpans indexes this table with
// thread->shader_mode to find the Span function to run.
// Columns: lodarrayindex, vertex shader, pixel shader, {arrays..., ~0}, {texunits..., ~0}.
// Row order presumably has to match the SHADERMODE_* enumeration - verify against its declaration.
4599 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4600 {
4601         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4602         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4603         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4604         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4605         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4606         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4607         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4608         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4609         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4610         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4611         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4612         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
             // NOTE(review): the four entries below give no texunits initializer, so that array is
             // zero-filled and has no ~0 terminator (unlike the Depth_Or_Shadow entry, which spells
             // out {~0} for both lists) - confirm the code walking texunits copes with that.
4613         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4614         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4615         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4616         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4617 };
4618
4619 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4620 {
4621         int i;
4622         int x;
4623         int startx;
4624         int endx;
4625 //      unsigned int c;
4626 //      unsigned int *colorpixel;
4627         unsigned int *depthpixel;
4628         float w;
4629         float wslope;
4630         int depth;
4631         int depthslope;
4632         unsigned int d;
4633         DPSOFTRAST_State_Triangle *triangle;
4634         DPSOFTRAST_State_Span *span;
4635         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4636         for (i = 0; i < thread->numspans; i++)
4637         {
4638                 span = &thread->spans[i];
4639                 triangle = &thread->triangles[span->triangle];
4640                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4641                 {
4642                         wslope = triangle->w[0];
4643                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4644                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4645                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4646                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4647                         startx = span->startx;
4648                         endx = span->endx;
4649                         switch(thread->fb_depthfunc)
4650                         {
4651                         default:
4652                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4653                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4654                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4655                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4656                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4657                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4658                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4659                         }
4660                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4661                         //for (x = startx;x < endx;x++)
4662                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4663                         // if there is no color buffer, skip pixel shader
4664                         while (startx < endx && !pixelmask[startx])
4665                                 startx++;
4666                         while (endx > startx && !pixelmask[endx-1])
4667                                 endx--;
4668                         if (startx >= endx)
4669                                 continue; // no pixels to fill
4670                         span->pixelmask = pixelmask;
4671                         span->startx = startx;
4672                         span->endx = endx;
4673                         // run pixel shader if appropriate
4674                         // do this before running depthmask code, to allow the pixelshader
4675                         // to clear pixelmask values for alpha testing
4676                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4677                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4678                         if (thread->depthmask)
4679                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4680                                         if (pixelmask[x])
4681                                                 depthpixel[x] = d;
4682                 }
4683                 else
4684                 {
4685                         // no depth testing means we're just dealing with color...
4686                         // if there is no color buffer, skip pixel shader
4687                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4688                         {
4689                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4690                                 span->pixelmask = pixelmask;
4691                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4692                         }
4693                 }
4694         }
4695         thread->numspans = 0;
4696 }
4697
// Draw command record (opcode 22). NOTE(review): DEFCOMMAND is defined outside
// this view; presumably it declares DPSOFTRAST_Command_Draw with these fields
// plus the common command header (code below reads command->commandsize) - confirm.
// Fields as used below: starty/endy = clamped scanline range of the draw,
// refcount = number of threads that still have to process the command,
// clipped = whether near-plane clipping must be evaluated per triangle,
// arrays = screen coordinates followed by the per-vertex arrays,
// element3i/element3s = optional private copy of the index list.
4698 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4699
// Executes one queued Draw command on behalf of a single raster thread.
// For every triangle of the command it:
//   - fetches the three vertex indices (16-bit, 32-bit or implicit),
//   - optionally clips against the near plane (producing 3 or 4 screen points),
//   - rejects triangles by facing according to thread->cullface,
//   - rejects triangles outside the scissor rectangle / this thread's bands,
//   - computes x/y gradients and origin for w and for every array listed by the
//     active shader mode, plus a mip level per texture unit,
//   - emits spans (at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels each) for the
//     scanlines that fall inside the thread's two bands [miny1,maxy1) and
//     [miny2,maxy2), flushing through DPSOFTRAST_Draw_ProcessSpans when the
//     span or triangle buffers fill up.
// Each thread decrements command->refcount once; the thread that brings it to
// zero frees command->arrays when the data was allocated separately from the
// command (commandsize no larger than the bare command header).
// The whole body is compiled only when SSE_POSSIBLE is defined.
4700 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4701 {
4702 #ifdef SSE_POSSIBLE
4703         int cullface = thread->cullface;
4704         int minx, maxx, miny, maxy;
4705         int miny1, maxy1, miny2, maxy2;
4706         __m128i fbmin, fbmax;
4707         __m128 viewportcenter, viewportscale;
4708         int firstvertex = command->firstvertex;
4709         int numvertices = command->numvertices;
4710         int numtriangles = command->numtriangles;
4711         const int *element3i = command->element3i;
4712         const unsigned short *element3s = command->element3s;
4713         int clipped = command->clipped;
4714         int i;
4715         int j;
4716         int k;
4717         int y;
4718         int e[3];
4719         __m128i screeny;
4720         int starty, endy, bandy;
4721         int numpoints;
4722         int clipcase;
4723         float clipdist[4];
4724         __m128 triangleedge1, triangleedge2, trianglenormal;
4725         __m128 clipfrac[3];
4726         __m128 screen[4];
4727         DPSOFTRAST_State_Triangle *triangle;
4728         DPSOFTRAST_Texture *texture;
4729         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // vertical scissor range, and this thread's two bands clamped into it
4730         miny = thread->fb_scissor[1];
4731         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4732         miny1 = bound(miny, thread->miny1, maxy);
4733         maxy1 = bound(miny, thread->maxy1, maxy);
4734         miny2 = bound(miny, thread->miny2, maxy);
4735         maxy2 = bound(miny, thread->maxy2, maxy);
             // the draw's scanline range misses both bands: nothing to rasterize
             // here, just release this thread's reference on the command
4736         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4737         {
4738                 if (!ATOMIC_DECREMENT(command->refcount))
4739                 {
4740                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4741                                 MM_FREE(command->arrays);
4742                 }
4743                 return;
4744         }
4745         minx = thread->fb_scissor[0];
4746         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // clamp rectangle as packed 16-bit (x, y) pairs; fbmax is inclusive,
             // hence the subtraction of 1 from the exclusive maxx/maxy2
4747         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4748         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4749         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4750         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4751         screen[3] = _mm_setzero_ps();
4752         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4753         for (i = 0;i < numtriangles;i++)
4754         {
                     // command->arrays starts with the screen coordinates; the next
                     // array (numvertices float[4] further) is what the clipping
                     // code below reads z and w from
4755                 const float *screencoord4f = command->arrays;
4756                 const float *arrays = screencoord4f + numvertices*4;
4757
4758                 // generate the 3 edges of this triangle
4759                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // indexed draws rebase the indices by firstvertex; non-indexed
                     // draws use three consecutive vertices per triangle
4760                 if (element3s)
4761                 {
4762                         e[0] = element3s[i*3+0] - firstvertex;
4763                         e[1] = element3s[i*3+1] - firstvertex;
4764                         e[2] = element3s[i*3+2] - firstvertex;
4765                 }
4766                 else if (element3i)
4767                 {
4768                         e[0] = element3i[i*3+0] - firstvertex;
4769                         e[1] = element3i[i*3+1] - firstvertex;
4770                         e[2] = element3i[i*3+2] - firstvertex;
4771                 }
4772                 else
4773                 {
4774                         e[0] = i*3+0;
4775                         e[1] = i*3+1;
4776                         e[2] = i*3+2;
4777                 }
4778
                     // SKIPBACKFACE: builds the two screen-space edges around point 1
                     // and the scalar cross product of their x/y parts; with
                     // cullface GL_BACK a negative value skips the triangle, with
                     // GL_FRONT a positive value does (via 'continue' of the
                     // triangle loop); any other cullface value skips nothing.
                     // The edges and the scalar are reused by the gradient setup.
4779 #define SKIPBACKFACE \
4780                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4781                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4782                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4783                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4784                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4785                 switch(cullface) \
4786                 { \
4787                 case GL_BACK: \
4788                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4789                                 continue; \
4790                         break; \
4791                 case GL_FRONT: \
4792                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4793                                 continue; \
4794                         break; \
4795                 }
4796
                     // CLIPPEDVERTEXLERP: screen[k] = projection of the point where
                     // edge p1->p2 crosses the clip plane; the interpolation
                     // fraction is kept in clipfrac[p1] for GENATTRIBLERP.
                     // CLIPPEDVERTEXCOPY: screen[k] = already projected vertex p1.
4797 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4798                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4799                         { \
4800                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4801                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4802                         }
4803 #define CLIPPEDVERTEXCOPY(k,p1) \
4804                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4805
                     // GENATTRIBS: produces the attribute values matching screen[0..2]
                     // for the current clipcase, copying unclipped vertices and
                     // interpolating clipped ones with the stored clipfrac; only
                     // three values are generated even for the 4-point cases.
4806 #define GENATTRIBCOPY(attrib, p1) \
4807                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4808 #define GENATTRIBLERP(attrib, p1, p2) \
4809                 { \
4810                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4811                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4812                 }
4813 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4814                 switch(clipcase) \
4815                 { \
4816                 default: \
4817                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4818                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4819                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4820                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4821                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4822                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4823                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4824                 }
4825
                     // draws flagged as not clipped skip the distance tests and jump
                     // straight into the "all three vertices in front" case
4826                 if (! clipped)
4827                         goto notclipped;
4828
4829                 // calculate distance from nearplane
                     // (z + w of each vertex; >= 0 is treated as in front)
4830                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4831                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4832                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4833                 if (clipdist[0] >= 0.0f)
4834                 {
4835                         if (clipdist[1] >= 0.0f)
4836                         {
4837                                 if (clipdist[2] >= 0.0f)
4838                                 {
4839                                 notclipped:
4840                                         // triangle is entirely in front of nearplane
4841                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4842                                         SKIPBACKFACE;
4843                                         numpoints = 3;
4844                                         clipcase = 0;
4845                                 }
4846                                 else
4847                                 {
4848                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4849                                         SKIPBACKFACE;
4850                                         numpoints = 4;
4851                                         clipcase = 1;
4852                                 }
4853                         }
4854                         else
4855                         {
4856                                 if (clipdist[2] >= 0.0f)
4857                                 {
4858                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4859                                         SKIPBACKFACE;
4860                                         numpoints = 4;
4861                                         clipcase = 2;
4862                                 }
4863                                 else
4864                                 {
4865                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4866                                         SKIPBACKFACE;
4867                                         numpoints = 3;
4868                                         clipcase = 3;
4869                                 }
4870                         }
4871                 }
4872                 else if (clipdist[1] >= 0.0f)
4873                 {
4874                         if (clipdist[2] >= 0.0f)
4875                         {
4876                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4877                                 SKIPBACKFACE;
4878                                 numpoints = 4;
4879                                 clipcase = 4;
4880                         }
4881                         else
4882                         {
4883                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4884                                 SKIPBACKFACE;
4885                                 numpoints = 3;
4886                                 clipcase = 5;
4887                         }
4888                 }
4889                 else if (clipdist[2] >= 0.0f)
4890                 {
4891                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4892                         SKIPBACKFACE;
4893                         numpoints = 3;
4894                         clipcase = 6;
4895                 }
4896                 else continue; // triangle is entirely behind nearplane
4897
4898                 {
4899                         // calculate integer y coords for triangle points
                             // screeni packs the truncated (x, y) of the points as
                             // 16-bit lanes; point 2 is duplicated when there are only
                             // three points. The min/max reductions leave the bounding
                             // box in the low lanes, clamped to fbmin/fbmax.
4900                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4901                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4902                                         screenmin = _mm_min_epi16(screeni, screenir),
4903                                         screenmax = _mm_max_epi16(screeni, screenir);
4904                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4905                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4906                         screenmin = _mm_max_epi16(screenmin, fbmin);
4907                         screenmax = _mm_min_epi16(screenmax, fbmax);
4908                         // skip offscreen triangles
4909                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4910                                 continue;
4911                         starty = _mm_extract_epi16(screenmin, 1);
4912                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // triangle lies entirely in the gap between the two bands
4913                         if (starty >= maxy1 && endy <= miny2)
4914                                 continue;
                             // arithmetic shift keeps the y half of each (x, y) pair,
                             // giving the point y coordinates as four 32-bit integers
4915                         screeny = _mm_srai_epi32(screeni, 16);
4916                 }
4917
4918                 triangle = &thread->triangles[thread->numtriangles];
4919
4920                 // calculate attribute plans for triangle data...
4921                 // okay, this triangle is going to produce spans, we'd better project
4922                 // the interpolants now (this is what gives perspective texturing),
4923                 // this consists of simply multiplying all arrays by the W coord
4924                 // (which is basically 1/Z), which will be undone per-pixel
4925                 // (multiplying by Z again) to get the perspective-correct array
4926                 // values
4927                 {
4928                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4929                         __m128 mipedgescale, mipdensity;
                             // edge components divided by the cross product scalar from
                             // SKIPBACKFACE, then broadcast individually; these factors
                             // turn per-edge deltas into per-pixel x and y slopes
4930                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4931                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4932                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4933                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4934                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4935                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4936                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4937                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane equation for w itself: slope in x, slope in y and
                             // the value at screen origin, relative to point 1
4938                         attribedge1 = _mm_sub_ss(w0, w1);
4939                         attribedge2 = _mm_sub_ss(w2, w1);
4940                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4941                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4942                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4943                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4944                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4945                         _mm_store_ss(&triangle->w[0], attribxslope);
4946                         _mm_store_ss(&triangle->w[1], attribyslope);
4947                         _mm_store_ss(&triangle->w[2], attriborigin);
4948                         mipedgescale = _mm_setzero_ps();
                             // same plane setup for every array the shader mode lists
                             // (list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL);
                             // 'arrays' advances one array per listed entry, values are
                             // premultiplied by w before the slopes are taken
4949                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4950                         {
4951                                 __m128 attrib0, attrib1, attrib2;
4952                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4953                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4954                                         break;
4955                                 arrays += numvertices*4;
4956                                 GENATTRIBS(attrib0, attrib1, attrib2);
4957                                 attriborigin = _mm_mul_ps(attrib1, w1);
4958                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4959                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4960                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4961                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4962                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4963                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4964                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4965                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array that drives LOD: change of its first
                                     // two components along each edge, divided by the
                                     // edge's screen-space length (approximate rsqrt)
4966                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4967                                 {
4968                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4969                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4970                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4971                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4972                                 }
4973                         }
4974
                             // pick a mip level per texture unit used by the shader mode;
                             // units keep level 0 unless the bound texture's filter is
                             // above DPSOFTRAST_TEXTURE_FILTER_LINEAR and the computed
                             // density is positive
4975                         memset(triangle->mip, 0, sizeof(triangle->mip));
4976                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4977                         {
4978                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4979                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4980                                         break;
4981                                 texture = thread->texbound[texunit];
4982                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4983                                 {
                                             // NOTE(review): mipmap[0][2..3] are presumably the
                                             // level-0 width and height - confirm against the
                                             // texture struct
4984                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4985                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4986                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4987                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4988                                         // this will be multiplied in the texturing routine by the texture resolution
4989                                         y = _mm_cvtss_si32(mipdensity);
4990                                         if (y > 0)
4991                                         {
                                                     // half of log2 of the squared density,
                                                     // limited to the last available mipmap
4992                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4993                                                 if (y > texture->mipmaps - 1)
4994                                                         y = texture->mipmaps - 1;
4995                                                 triangle->mip[texunit] = y;
4996                                         }
4997                                 }
4998                         }
4999                 }
5000         
                     // scanline walk: the first pass covers the part of the triangle in
                     // band 1 (up to min(endy, maxy1)); the loop step then moves y up
                     // to at least miny2 and continues to min(endy, maxy2) for band 2
5001                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5002                 for (; y < bandy;)
5003                 {
5004                         __m128 xcoords, xslope;
                             // per point: all-ones where y is greater than the point's y;
                             // the resulting mask selects the two edges (p -> n point
                             // indices) that bound the current scanline
5005                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5006                         int yccmask = _mm_movemask_epi8(ycc);
5007                         int edge0p, edge0n, edge1p, edge1n;
5008                         int nexty;
5009                         if (numpoints == 4)
5010                         {
5011                                 switch(yccmask)
5012                                 {
5013                                 default:
5014                                 case 0xFFFF: /*0000*/ y = endy; continue;
5015                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5016                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5017                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5018                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5019                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5020                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5021                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5022                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5023                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5024                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5025                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5026                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5027                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5028                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5029                                 case 0x0000: /*1111*/ y++; continue;
5030                                 }
5031                         }
5032                         else
5033                         {
5034                                 switch(yccmask)
5035                                 {
5036                                 default:
5037                                 case 0xFFFF: /*000*/ y = endy; continue;
5038                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5039                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5040                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5041                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5042                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5043                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5044                                 case 0x0000: /*111*/ y++; continue;
5045                                 }
5046                         }
                             // nexty = lowest point y that is not above the current
                             // scanline (points already passed are replaced by 0x7FFF),
                             // i.e. the last scanline on which the chosen edge pair stays
                             // valid; limited to the current band
5047                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5048                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5049                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5050                         nexty = _mm_extract_epi16(ycc, 0);
5051                         if (nexty >= bandy) nexty = bandy-1;
                             // dx/dy of both edges at once, then x of both edges on
                             // scanline y, with 0.5 added before the rounding conversion
5052                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5053                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5054                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5055                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5056                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // make sure the edge with the smaller x comes first
5057                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5058                         {
5059                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5060                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5061                         }
5062                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5063                         {
5064                                 int startx, endx, offset;
5065                                 startx = _mm_cvtss_si32(xcoords);
5066                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // when the scanline starts left of the scissor, advance
                                     // startx in whole multiples of the maximum span length
                                     // so it never passes minx; the exact trim happens below
5067                                 if (startx < minx) 
5068                                 {
5069                                         if (startx < 0) startx = 0;
5070                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5071                                 }
5072                                 if (endx > maxx) endx = maxx;
5073                                 if (startx >= endx) continue;
                                     // cut the scanline into spans of at most
                                     // DPSOFTRAST_DRAW_MAXSPANLENGTH pixels; span->startx and
                                     // span->endx are relative to span->x
5074                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5075                                 {
5076                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5077                                         span->triangle = thread->numtriangles;
5078                                         span->x = offset;
5079                                         span->y = y;
5080                                         span->startx = max(minx - offset, 0);
5081                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5082                                         if (span->startx >= span->endx)
5083                                                 continue; 
                                             // span buffer full: shade what has been queued
5084                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5085                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5086                                 }
5087                         }
5088                 }
5089
                     // triangle buffer full: flush pending spans and restart at slot 0
5090                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5091                 {
5092                         DPSOFTRAST_Draw_ProcessSpans(thread);
5093                         thread->numtriangles = 0;
5094                 }
5095         }
5096
             // done reading the vertex data: drop this thread's reference, and free
             // the separately allocated arrays if this was the last reference
5097         if (!ATOMIC_DECREMENT(command->refcount))
5098         {
5099                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5100                         MM_FREE(command->arrays);
5101         }
5102
             // shade whatever is still queued for this draw
5103         if (thread->numspans > 0 || thread->numtriangles > 0)
5104         {
5105                 DPSOFTRAST_Draw_ProcessSpans(thread);
5106                 thread->numtriangles = 0;
5107         }
5108 #endif
5109 }
5110
5111 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5112 {
5113         int i;
5114         int j;
5115         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5116         int datasize = 2*numvertices*sizeof(float[4]);
5117         DPSOFTRAST_Command_Draw *command;
5118         unsigned char *data;
5119         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5120         {
5121                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5122                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5123                         break;
5124                 datasize += numvertices*sizeof(float[4]);
5125         }
5126         if (element3s)
5127                 datasize += numtriangles*sizeof(unsigned short[3]);
5128         else if (element3i)
5129                 datasize += numtriangles*sizeof(int[3]);
5130         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5131         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5132         {
5133                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5134                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5135         }
5136         else
5137         {
5138                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5139                 data = (unsigned char *)command + commandsize;
5140         }
5141         command->firstvertex = firstvertex;
5142         command->numvertices = numvertices;
5143         command->numtriangles = numtriangles;
5144         command->arrays = (float *)data;
5145         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5146         dpsoftrast.firstvertex = firstvertex;
5147         dpsoftrast.numvertices = numvertices;
5148         dpsoftrast.screencoord4f = (float *)data;
5149         data += numvertices*sizeof(float[4]);
5150         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5151         data += numvertices*sizeof(float[4]);
5152         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5153         {
5154                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5155                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5156                         break;
5157                 dpsoftrast.post_array4f[j] = (float *)data;
5158                 data += numvertices*sizeof(float[4]);
5159         }
5160         command->element3i = NULL;
5161         command->element3s = NULL;
5162         if (element3s)
5163         {
5164                 command->element3s = (unsigned short *)data;
5165                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5166         }
5167         else if (element3i)
5168         {
5169                 command->element3i = (int *)data;
5170                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5171         }
5172         return command;
5173 }
5174
5175 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5176 {
5177         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5178         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5179         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5180         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5181         if (command->starty >= command->endy)
5182         {
5183                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5184                         MM_FREE(command->arrays);
5185                 DPSOFTRAST_UndoCommand(command->commandsize);
5186                 return;
5187         }
5188         command->clipped = dpsoftrast.drawclipped;
5189         command->refcount = dpsoftrast.numthreads;
5190
5191         if (dpsoftrast.usethreads)
5192         {
5193                 int i;
5194                 DPSOFTRAST_Draw_SyncCommands();
5195                 for (i = 0; i < dpsoftrast.numthreads; i++)
5196                 {
5197                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5198                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5199                                 Thread_CondSignal(thread->drawcond);
5200                 }
5201         }
5202         else
5203         {
5204                 DPSOFTRAST_Draw_FlushThreads();
5205         }
5206 }
5207
5208 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5209 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5210 {
5211         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5212 }
5213 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5214 {
5215         DPSOFTRAST_Command_SetRenderTargets *command;
5216         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5217                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5218                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5219                 DPSOFTRAST_Flush();
5220         dpsoftrast.fb_width = width;
5221         dpsoftrast.fb_height = height;
5222         dpsoftrast.fb_depthpixels = depthpixels;
5223         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5224         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5225         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5226         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5227         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5228         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5229         command->width = width;
5230         command->height = height;
5231 }
5232  
5233 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5234 {
5235         int commandoffset = thread->commandoffset;
5236         while (commandoffset != endoffset)
5237         {
5238                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5239                 switch (command->opcode)
5240                 {
5241 #define INTERPCOMMAND(name) \
5242                 case DPSOFTRAST_OPCODE_##name : \
5243                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5244                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5245                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5246                                 commandoffset = 0; \
5247                         break;
5248                 INTERPCOMMAND(Viewport)
5249                 INTERPCOMMAND(ClearColor)
5250                 INTERPCOMMAND(ClearDepth)
5251                 INTERPCOMMAND(ColorMask)
5252                 INTERPCOMMAND(DepthTest)
5253                 INTERPCOMMAND(ScissorTest)
5254                 INTERPCOMMAND(Scissor)
5255                 INTERPCOMMAND(BlendFunc)
5256                 INTERPCOMMAND(BlendSubtract)
5257                 INTERPCOMMAND(DepthMask)
5258                 INTERPCOMMAND(DepthFunc)
5259                 INTERPCOMMAND(DepthRange)
5260                 INTERPCOMMAND(PolygonOffset)
5261                 INTERPCOMMAND(CullFace)
5262                 INTERPCOMMAND(AlphaTest)
5263                 INTERPCOMMAND(AlphaFunc)
5264                 INTERPCOMMAND(SetTexture)
5265                 INTERPCOMMAND(SetShader)
5266                 INTERPCOMMAND(Uniform4f)
5267                 INTERPCOMMAND(UniformMatrix4f)
5268                 INTERPCOMMAND(Uniform1i)
5269                 INTERPCOMMAND(SetRenderTargets)
5270
5271                 case DPSOFTRAST_OPCODE_Draw:
5272                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5273                         commandoffset += command->commandsize;
5274                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5275                                 commandoffset = 0;
5276                         thread->commandoffset = commandoffset;
5277                         break;
5278
5279                 case DPSOFTRAST_OPCODE_Reset:
5280                         commandoffset = 0;
5281                         break;
5282                 }
5283         }
5284         thread->commandoffset = commandoffset;
5285 }
5286
5287 static int DPSOFTRAST_Draw_Thread(void *data)
5288 {
5289         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5290         while(thread->index >= 0)
5291         {
5292                 if (thread->commandoffset != dpsoftrast.drawcommand)
5293                 {
5294                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5295                 }
5296                 else 
5297                 {
5298                         Thread_LockMutex(thread->drawmutex);
5299                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5300                         {
5301                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5302                                 thread->starving = true;
5303                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5304                                 thread->starving = false;
5305                         }
5306                         Thread_UnlockMutex(thread->drawmutex);
5307                 }
5308         }   
5309         return 0;
5310 }
5311
5312 static void DPSOFTRAST_Draw_FlushThreads(void)
5313 {
5314         DPSOFTRAST_State_Thread *thread;
5315         int i;
5316         DPSOFTRAST_Draw_SyncCommands();
5317         if (dpsoftrast.usethreads) 
5318         {
5319                 for (i = 0; i < dpsoftrast.numthreads; i++)
5320                 {
5321                         thread = &dpsoftrast.threads[i];
5322                         if (thread->commandoffset != dpsoftrast.drawcommand)
5323                         {
5324                                 Thread_LockMutex(thread->drawmutex);
5325                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5326                                         Thread_CondSignal(thread->drawcond);
5327                                 Thread_UnlockMutex(thread->drawmutex);
5328                         }
5329                 }
5330                 for (i = 0; i < dpsoftrast.numthreads; i++)
5331                 {
5332                         thread = &dpsoftrast.threads[i];
5333                         if (thread->commandoffset != dpsoftrast.drawcommand)
5334                         {
5335                                 Thread_LockMutex(thread->drawmutex);
5336                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5337                                 {
5338                                         thread->waiting = true;
5339                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5340                                         thread->waiting = false;
5341                                 }
5342                                 Thread_UnlockMutex(thread->drawmutex);
5343                         }
5344                 }
5345         }
5346         else
5347         {
5348                 for (i = 0; i < dpsoftrast.numthreads; i++)
5349                 {
5350                         thread = &dpsoftrast.threads[i];
5351                         if (thread->commandoffset != dpsoftrast.drawcommand)
5352                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5353                 }
5354         }
5355         dpsoftrast.commandpool.usedcommands = 0;
5356 }
5357
/*
 * Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads(), which
 * executes all pending commands and empties the command pool.
 */
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5362
/*
 * Public finish entry point.  Has the same effect as DPSOFTRAST_Flush():
 * all pending commands are executed and the command pool is emptied.
 */
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5367
5368 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5369 {
5370         int i;
5371         union
5372         {
5373                 int i;
5374                 unsigned char b[4];
5375         }
5376         u;
5377         u.i = 1;
5378         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5379         dpsoftrast.bigendian = u.b[3];
5380         dpsoftrast.fb_width = width;
5381         dpsoftrast.fb_height = height;
5382         dpsoftrast.fb_depthpixels = depthpixels;
5383         dpsoftrast.fb_colorpixels[0] = colorpixels;
5384         dpsoftrast.fb_colorpixels[1] = NULL;
5385         dpsoftrast.fb_colorpixels[1] = NULL;
5386         dpsoftrast.fb_colorpixels[1] = NULL;
5387         dpsoftrast.viewport[0] = 0;
5388         dpsoftrast.viewport[1] = 0;
5389         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5390         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5391         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5392         dpsoftrast.texture_firstfree = 1;
5393         dpsoftrast.texture_end = 1;
5394         dpsoftrast.texture_max = 0;
5395         dpsoftrast.color[0] = 1;
5396         dpsoftrast.color[1] = 1;
5397         dpsoftrast.color[2] = 1;
5398         dpsoftrast.color[3] = 1;
5399         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5400         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5401         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5402         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5403         for (i = 0; i < dpsoftrast.numthreads; i++)
5404         {
5405                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5406                 thread->index = i;
5407                 thread->cullface = GL_BACK;
5408                 thread->colormask[1] = 1;
5409                 thread->colormask[2] = 1;
5410                 thread->colormask[3] = 1;
5411                 thread->blendfunc[0] = GL_ONE;
5412                 thread->blendfunc[1] = GL_ZERO;
5413                 thread->depthmask = true;
5414                 thread->depthtest = true;
5415                 thread->depthfunc = GL_LEQUAL;
5416                 thread->scissortest = false;
5417                 thread->alphatest = false;
5418                 thread->alphafunc = GL_GREATER;
5419                 thread->alphavalue = 0.5f;
5420                 thread->viewport[0] = 0;
5421                 thread->viewport[1] = 0;
5422                 thread->viewport[2] = dpsoftrast.fb_width;
5423                 thread->viewport[3] = dpsoftrast.fb_height;
5424                 thread->scissor[0] = 0;
5425                 thread->scissor[1] = 0;
5426                 thread->scissor[2] = dpsoftrast.fb_width;
5427                 thread->scissor[3] = dpsoftrast.fb_height;
5428                 thread->depthrange[0] = 0;
5429                 thread->depthrange[1] = 1;
5430                 thread->polygonoffset[0] = 0;
5431                 thread->polygonoffset[1] = 0;
5432         
5433                 DPSOFTRAST_RecalcThread(thread);
5434         
5435                 thread->numspans = 0;
5436                 thread->numtriangles = 0;
5437                 thread->commandoffset = 0;
5438                 thread->waiting = false;
5439                 thread->starving = false;
5440            
5441                 thread->validate = -1;
5442                 DPSOFTRAST_Validate(thread, -1);
5443  
5444                 if (dpsoftrast.usethreads)
5445                 {
5446                         thread->waitcond = Thread_CreateCond();
5447                         thread->drawcond = Thread_CreateCond();
5448                         thread->drawmutex = Thread_CreateMutex();
5449                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5450                 }
5451         }
5452         return 0;
5453 }
5454
5455 void DPSOFTRAST_Shutdown(void)
5456 {
5457         int i;
5458         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5459         {
5460                 DPSOFTRAST_State_Thread *thread;
5461                 for (i = 0; i < dpsoftrast.numthreads; i++)
5462                 {
5463                         thread = &dpsoftrast.threads[i];
5464                         Thread_LockMutex(thread->drawmutex);
5465                         thread->index = -1;
5466                         Thread_CondSignal(thread->drawcond);
5467                         Thread_UnlockMutex(thread->drawmutex);
5468                         Thread_WaitThread(thread->thread, 0);
5469                         Thread_DestroyCond(thread->waitcond);
5470                         Thread_DestroyCond(thread->drawcond);
5471                         Thread_DestroyMutex(thread->drawmutex);
5472                 }
5473         }
5474         for (i = 0;i < dpsoftrast.texture_end;i++)
5475                 if (dpsoftrast.texture[i].bytes)
5476                         MM_FREE(dpsoftrast.texture[i].bytes);
5477         if (dpsoftrast.texture)
5478                 free(dpsoftrast.texture);
5479         if (dpsoftrast.threads)
5480                 MM_FREE(dpsoftrast.threads);
5481         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5482 }
5483