]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
factor out thread interface from dpsoftrast
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
#ifndef __cplusplus
// when not compiling as C++, provide 'bool' as an alias of qboolean
// (qboolean is declared outside this file)
typedef qboolean bool;
#endif
12
// byte alignments used by the ALIGN() and ATOMIC() wrappers below
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// Compiler-specific alignment, memory barrier and atomic counter primitives.
// These are only provided when SSE2_PRESENT is defined and the compiler is
// GCC-compatible or MSVC; every other configuration falls through to the
// plain (non-atomic) fallbacks defined further down.
#ifdef SSE2_PRESENT
        #if defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif

// fallbacks: no forced alignment, no barrier, ordinary integer arithmetic
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
59
// Allocation wrappers: SSE2 builds allocate on an ATOMIC_SIZE-byte boundary
// through _mm_malloc/_mm_free, other builds use the C library directly.
// Memory obtained from MM_MALLOC/MM_CALLOC must be released with MM_FREE.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc()-style allocation of nmemb elements of size bytes each, zero filled.
// Returns NULL if the allocation fails or if nmemb*size does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // refuse requests whose byte count would wrap around, otherwise a
        // too-small buffer would be handed back to the caller
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
78
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets); DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// to size per-array tables elsewhere in this file.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
94
// One slot of the texture table (dpsoftrast.texture[]).
// A slot whose 'bytes' pointer is NULL is treated as free.
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // flags value passed to DPSOFTRAST_Texture_New
        int width; // dimensions of mip level 0
        int height;
        int depth;
        int sides; // 6 for cubemaps, otherwise 1
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of valid entries in mipmap[]
        int size; // total size in bytes of all mip levels
        ATOMIC_COUNTER binds; // when nonzero, DPSOFTRAST_Flush is called before the texture is modified
        unsigned char *bytes; // pixel storage for all mip levels
        // per mip level: [0] byte offset into bytes, [1] size in bytes,
        // [2] width, [3] height, [4] depth
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
}
DPSOFTRAST_Texture;
110
// commands are aligned to ALIGN_SIZE bytes
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header of every command; the structures generated by DEFCOMMAND
// start with these same two fields.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
}
DPSOFTRAST_Command);

// opcode 0 is DPSOFTRAST_OPCODE_Reset
enum { DPSOFTRAST_OPCODE_Reset = 0 };
122
// DEFCOMMAND(opcodeval, name, fields) declares one command type:
// - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
// - the aligned struct DPSOFTRAST_Command_<name>, consisting of the common
//   opcode/commandsize header followed by the given fields
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
131
// size in bytes of the command pool buffer
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size of a single command - confirm against its users
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Byte buffer holding queued commands.
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand; // copied into dpsoftrast.drawcommand by DPSOFTRAST_Draw_SyncCommands
        int usedcommands; // compared against the pool size in DPSOFTRAST_Draw_FreeCommandPool
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
142
// Per-triangle interpolation data.
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // per attribute array, three 4-component vectors as consumed by
        // DPSOFTRAST_CALCATTRIB: [0] is scaled by span x, [1] is scaled by
        // span y, [2] is the constant term
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
150
// Evaluate one interpolated attribute of a triangle at the start of a span.
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// DPSOFTRAST_CALCATTRIB works on SSE registers (data and slope are __m128),
// DPSOFTRAST_CALCATTRIB4F is the scalar version (data and slope are float[4]).
// Every macro argument is parenthesized so arbitrary pointer expressions may
// be passed for triangle and span.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        slope[0] = (triangle)->attribs[arrayindex][0][0]; \
        slope[1] = (triangle)->attribs[arrayindex][0][1]; \
        slope[2] = (triangle)->attribs[arrayindex][0][2]; \
        slope[3] = (triangle)->attribs[arrayindex][0][3]; \
        data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
167                                         
168 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
169
// One horizontal run of pixels belonging to a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
180
// capacities of the per-thread spans[] and triangles[] arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of DPSOFTRAST_State_Thread.validate: each one marks a group of derived
// values that DPSOFTRAST_Validate recalculates and then clears
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
188
// Blend modes selected by DPSOFTRAST_RecalcBlendFunc from the current
// blendfunc factors; factor pairs it does not recognize map to OPAQUE.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE,
        DPSOFTRAST_BLENDMODE_ALPHA,
        DPSOFTRAST_BLENDMODE_ADDALPHA,
        DPSOFTRAST_BLENDMODE_ADD,
        DPSOFTRAST_BLENDMODE_INVMOD,
        DPSOFTRAST_BLENDMODE_MUL,
        DPSOFTRAST_BLENDMODE_MUL2,
        DPSOFTRAST_BLENDMODE_SUBALPHA,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
        DPSOFTRAST_BLENDMODE_INVADD,
        DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
204
// Per-thread rasterizer state: a private copy of the render state, the values
// derived from it, and the thread's span/triangle work buffers.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index;
        
        // render state
        int cullface;
        int colormask[4];
        int blendfunc[2]; // [0] source factor, [1] destination factor
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4]; // x, y, width, height
        int scissor[4]; // x, y, width, height
        float depthrange[2];
        float polygonoffset[2];

        int shader_mode;
        int shader_permutation;

        // pointers into dpsoftrast.texture[] (rebased by DPSOFTRAST_Texture_Grow)
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc;

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode;

        // band boundaries
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        // NOTE(review): synchronization state; the objects behind these
        // pointers are created outside the visible part of this file
        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
}
DPSOFTRAST_State_Thread);
269
// Global rasterizer state: render targets, vertex array pointers, the texture
// table, the thread list and the shared command pool.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        // render targets (set by DPSOFTRAST_SetRenderTargets)
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        // pointers into texture[] (rebased by DPSOFTRAST_Texture_Grow)
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;

        // texture table
        int texture_max; // number of allocated slots in texture[]
        int texture_end; // one past the highest slot index in use
        int texture_firstfree; // index at which the search for a free slot starts
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace;
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        // set from commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single global instance used by every function in this file
DPSOFTRAST_State dpsoftrast;
328
// depth values are stored as integers scaled by DPSOFTRAST_DEPTHSCALE (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// pack four float color components into one BGRA8 pixel, rounding each
// component to the nearest integer after scaling by 255
// (arguments are parenthesized so expressions such as a+b convert correctly)
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// convert a float depth d to the integer depth value DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
334
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
336 {
337         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339         fb_viewportcenter[3] = 0.5f;
340         fb_viewportcenter[0] = 0.0f;
341         fb_viewportscale[1] = 0.5f * viewport[2];
342         fb_viewportscale[2] = -0.5f * viewport[3];
343         fb_viewportscale[3] = 0.5f;
344         fb_viewportscale[0] = 1.0f;
345 }
346
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
348 {
349         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350         // and viewport projection values
351         int x1, x2;
352         int y1, y2;
353         x1 = thread->scissor[0];
354         x2 = thread->scissor[0] + thread->scissor[2];
355         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356         y2 = dpsoftrast.fb_height - thread->scissor[1];
357         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
358         if (x1 < 0) x1 = 0;
359         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
360         if (y1 < 0) y1 = 0;
361         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
362         thread->fb_scissor[0] = x1;
363         thread->fb_scissor[1] = y1;
364         thread->fb_scissor[2] = x2 - x1;
365         thread->fb_scissor[3] = y2 - y1;
366
367         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
368 }
369
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
371 {
372         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
373 }
374
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
376 {
377         if (thread->blendsubtract)
378         {
379                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
380                 {
381                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
383                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
385                 }
386         }
387         else
388         {       
389                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390                 {
391                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
396                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
402                 }
403         }
404 }
405
// Call DPSOFTRAST_Validate(thread, f) only if at least one of the flags in f
// is pending in thread->validate; the expression always evaluates to 0.
// The thread argument is parenthesized so any pointer expression is accepted.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
407
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
409 {
410         mask &= thread->validate;
411         if (!mask)
412                 return;
413         if (mask & DPSOFTRAST_VALIDATE_FB)
414         {
415                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416                 DPSOFTRAST_RecalcFB(thread);
417         }
418         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
419         {
420                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421                 DPSOFTRAST_RecalcDepthFunc(thread);
422         }
423         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
424         {
425                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426                 DPSOFTRAST_RecalcBlendFunc(thread);
427         }
428 }
429
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
431 {
432         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433                 return &dpsoftrast.texture[index];
434         return NULL;
435 }
436
437 static void DPSOFTRAST_Texture_Grow(void)
438 {
439         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440         DPSOFTRAST_State_Thread *thread;
441         int i;
442         int j;
443         DPSOFTRAST_Flush();
444         // expand texture array as needed
445         if (dpsoftrast.texture_max < 1024)
446                 dpsoftrast.texture_max = 1024;
447         else
448                 dpsoftrast.texture_max *= 2;
449         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
450         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451                 if (dpsoftrast.texbound[i])
452                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
453         for (j = 0; j < dpsoftrast.numthreads; j++)
454         {
455                 thread = &dpsoftrast.threads[j];
456                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457                         if (thread->texbound[i])
458                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
459         }
460 }
461
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
463 {
464         int w;
465         int h;
466         int d;
467         int size;
468         int s;
469         int texnum;
470         int mipmaps;
471         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473         DPSOFTRAST_Texture *texture;
474         if (width*height*depth < 1)
475         {
476                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
477                 return 0;
478         }
479         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
480         {
481                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
482                 return 0;
483         }
484         switch(texformat)
485         {
486         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
489                 break;
490         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
492                 {
493                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
494                         return 0;
495                 }
496                 if (depth != 1)
497                 {
498                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
499                         return 0;
500                 }
501                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
502                 {
503                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
504                         return 0;
505                 }
506                 break;
507         }
508         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
509         {
510                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
511                 return 0;
512         }
513         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
514         {
515                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
516                 return 0;
517         }
518         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
519         {
520                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
521                 return 0;
522         }
523         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524         {
525                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
526                 return 0;
527         }
528         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
529         {
530                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
531                 return 0;
532         }
533         // find first empty slot in texture array
534         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535                 if (!dpsoftrast.texture[texnum].bytes)
536                         break;
537         dpsoftrast.texture_firstfree = texnum + 1;
538         if (dpsoftrast.texture_max <= texnum)
539                 DPSOFTRAST_Texture_Grow();
540         if (dpsoftrast.texture_end <= texnum)
541                 dpsoftrast.texture_end = texnum + 1;
542         texture = &dpsoftrast.texture[texnum];
543         memset(texture, 0, sizeof(*texture));
544         texture->flags = flags;
545         texture->width = width;
546         texture->height = height;
547         texture->depth = depth;
548         texture->sides = sides;
549         texture->binds = 0;
550         w = width;
551         h = height;
552         d = depth;
553         size = 0;
554         mipmaps = 0;
555         w = width;
556         h = height;
557         d = depth;
558         for (;;)
559         {
560                 s = w * h * d * sides * 4;
561                 texture->mipmap[mipmaps][0] = size;
562                 texture->mipmap[mipmaps][1] = s;
563                 texture->mipmap[mipmaps][2] = w;
564                 texture->mipmap[mipmaps][3] = h;
565                 texture->mipmap[mipmaps][4] = d;
566                 size += s;
567                 mipmaps++;
568                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569                         break;
570                 if (w > 1) w >>= 1;
571                 if (h > 1) h >>= 1;
572                 if (d > 1) d >>= 1;
573         }
574         texture->mipmaps = mipmaps;
575         texture->size = size;
576
577         // allocate the pixels now
578         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
579
580         return texnum;
581 }
582 void DPSOFTRAST_Texture_Free(int index)
583 {
584         DPSOFTRAST_Texture *texture;
585         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
586         if (texture->binds)
587                 DPSOFTRAST_Flush();
588         if (texture->bytes)
589                 MM_FREE(texture->bytes);
590         texture->bytes = NULL;
591         memset(texture, 0, sizeof(*texture));
592         // adjust the free range and used range
593         if (dpsoftrast.texture_firstfree > index)
594                 dpsoftrast.texture_firstfree = index;
595         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596                 dpsoftrast.texture_end--;
597 }
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
599 {
600         int i, x, y, z, w, layer0, layer1, row0, row1;
601         unsigned char *o, *i0, *i1, *i2, *i3;
602         DPSOFTRAST_Texture *texture;
603         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604         if (texture->mipmaps <= 1)
605                 return;
606         for (i = 1;i < texture->mipmaps;i++)
607         {
608                 for (z = 0;z < texture->mipmap[i][4];z++)
609                 {
610                         layer0 = z*2;
611                         layer1 = z*2+1;
612                         if (layer1 >= texture->mipmap[i-1][4])
613                                 layer1 = texture->mipmap[i-1][4]-1;
614                         for (y = 0;y < texture->mipmap[i][3];y++)
615                         {
616                                 row0 = y*2;
617                                 row1 = y*2+1;
618                                 if (row1 >= texture->mipmap[i-1][3])
619                                         row1 = texture->mipmap[i-1][3]-1;
620                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
621                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625                                 w = texture->mipmap[i][2];
626                                 if (layer1 > layer0)
627                                 {
628                                         if (texture->mipmap[i-1][2] > 1)
629                                         {
630                                                 // average 3D texture
631                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
632                                                 {
633                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
637                                                 }
638                                         }
639                                         else
640                                         {
641                                                 // average 3D mipmap with parent width == 1
642                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
643                                                 {
644                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
648                                                 }
649                                         }
650                                 }
651                                 else
652                                 {
653                                         if (texture->mipmap[i-1][2] > 1)
654                                         {
655                                                 // average 2D texture (common case)
656                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
657                                                 {
658                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
662                                                 }
663                                         }
664                                         else
665                                         {
666                                                 // 2D texture with parent width == 1
667                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
668                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
669                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
670                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
671                                         }
672                                 }
673                         }
674                 }
675         }
676 }
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
678 {
679         DPSOFTRAST_Texture *texture;
680         unsigned char *dst;
681         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
682         if (texture->binds)
683                 DPSOFTRAST_Flush();
684         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685         while (blockheight > 0)
686         {
687                 memcpy(dst, pixels, blockwidth * 4);
688                 pixels += blockwidth * 4;
689                 dst += texture->mipmap[0][2] * 4;
690                 blockheight--;
691         }
692         DPSOFTRAST_Texture_CalculateMipmaps(index);
693 }
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
695 {
696         DPSOFTRAST_Texture *texture;
697         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
698         if (texture->binds)
699                 DPSOFTRAST_Flush();
700         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707         return texture->mipmap[mip][2];
708 }
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
710 {
711         DPSOFTRAST_Texture *texture;
712         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713         return texture->mipmap[mip][3];
714 }
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
716 {
717         DPSOFTRAST_Texture *texture;
718         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719         return texture->mipmap[mip][4];
720 }
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
722 {
723         DPSOFTRAST_Texture *texture;
724         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725         if (texture->binds)
726                 DPSOFTRAST_Flush();
727         return texture->bytes + texture->mipmap[mip][0];
728 }
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
734         {
735                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
736                 return;
737         }
738         if (texture->binds)
739                 DPSOFTRAST_Flush();
740         texture->filter = filter;
741 }
742
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
744 {
745         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
748                 DPSOFTRAST_Flush();
749         dpsoftrast.fb_width = width;
750         dpsoftrast.fb_height = height;
751         dpsoftrast.fb_depthpixels = depthpixels;
752         dpsoftrast.fb_colorpixels[0] = colorpixels0;
753         dpsoftrast.fb_colorpixels[1] = colorpixels1;
754         dpsoftrast.fb_colorpixels[2] = colorpixels2;
755         dpsoftrast.fb_colorpixels[3] = colorpixels3;
756 }
757
758 static void DPSOFTRAST_Draw_FlushThreads(void);
759
760 static void DPSOFTRAST_Draw_SyncCommands(void)
761 {
762         if(dpsoftrast.usethreads) MEMORY_BARRIER;
763         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
764 }
765
766 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
767 {
768         DPSOFTRAST_State_Thread *thread;
769         int i;
770         int freecommand = dpsoftrast.commandpool.freecommand;
771         int usedcommands = dpsoftrast.commandpool.usedcommands;
772         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
773                 return;
774         DPSOFTRAST_Draw_SyncCommands();
775         for(;;)
776         {
777                 int waitindex = -1;
778                 int commandoffset;
779                 usedcommands = 0;
780                 for (i = 0; i < dpsoftrast.numthreads; i++)
781                 {
782                         thread = &dpsoftrast.threads[i]; 
783                         commandoffset = freecommand - thread->commandoffset;
784                         if (commandoffset < 0)
785                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
786                         if (commandoffset > usedcommands)
787                         {
788                                 waitindex = i;
789                                 usedcommands = commandoffset;
790                         }
791                 }
792                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
793                         break;
794                 thread = &dpsoftrast.threads[waitindex];
795                 Thread_LockMutex(thread->drawmutex);
796                 if (thread->commandoffset != dpsoftrast.drawcommand)
797                 {
798                         thread->waiting = true;
799                         if (thread->starving) Thread_CondSignal(thread->drawcond);
800                         Thread_CondWait(thread->waitcond, thread->drawmutex);
801                         thread->waiting = false;
802                 }
803                 Thread_UnlockMutex(thread->drawmutex);
804         }
805         dpsoftrast.commandpool.usedcommands = usedcommands;
806 }
807
// Rounds `size` up to the next multiple of COMMAND_SIZE.
// COMMAND_SIZE must be a power of two (the masking relies on it).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name>, sized to the aligned size of that struct.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
812
813 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
814 {
815         DPSOFTRAST_Command *command;
816         int freecommand = dpsoftrast.commandpool.freecommand;
817         int usedcommands = dpsoftrast.commandpool.usedcommands;
818         int extra = sizeof(DPSOFTRAST_Command);
819         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
820                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
821         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
822         {
823                 if (dpsoftrast.usethreads)
824                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
825                 else
826                         DPSOFTRAST_Draw_FlushThreads();
827                 freecommand = dpsoftrast.commandpool.freecommand;
828                 usedcommands = dpsoftrast.commandpool.usedcommands;
829         }
830         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831         {
832                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833                 command->opcode = DPSOFTRAST_OPCODE_Reset;
834                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
835                 freecommand = 0;
836         }
837         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838         command->opcode = opcode;
839         command->commandsize = size;
840         freecommand += size;
841         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
842                 freecommand = 0;
843         dpsoftrast.commandpool.freecommand = freecommand;
844         dpsoftrast.commandpool.usedcommands = usedcommands + size;
845         return command;
846 }
847
848 static void DPSOFTRAST_UndoCommand(int size)
849 {
850         int freecommand = dpsoftrast.commandpool.freecommand;
851         int usedcommands = dpsoftrast.commandpool.usedcommands;
852         freecommand -= size;
853         if (freecommand < 0)
854                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855         usedcommands -= size;
856         dpsoftrast.commandpool.freecommand = freecommand;
857         dpsoftrast.commandpool.usedcommands = usedcommands;
858 }
859                 
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
862 {
863         thread->viewport[0] = command->x;
864         thread->viewport[1] = command->y;
865         thread->viewport[2] = command->width;
866         thread->viewport[3] = command->height;
867         thread->validate |= DPSOFTRAST_VALIDATE_FB;
868 }
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
870 {
871         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
872         command->x = x;
873         command->y = y;
874         command->width = width;
875         command->height = height;
876
877         dpsoftrast.viewport[0] = x;
878         dpsoftrast.viewport[1] = y;
879         dpsoftrast.viewport[2] = width;
880         dpsoftrast.viewport[3] = height;
881         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
882 }
883
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
886 {
887         int i, x1, y1, x2, y2, w, h, x, y;
888         int miny1 = thread->miny1;
889         int maxy1 = thread->maxy1;
890         int miny2 = thread->miny2;
891         int maxy2 = thread->maxy2;
892         int bandy;
893         unsigned int *p;
894         unsigned int c;
895         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896         x1 = thread->fb_scissor[0];
897         y1 = thread->fb_scissor[1];
898         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
900         if (y1 < miny1) y1 = miny1;
901         if (y2 > maxy2) y2 = maxy2;
902         w = x2 - x1;
903         h = y2 - y1;
904         if (w < 1 || h < 1)
905                 return;
906         // FIXME: honor fb_colormask?
907         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
908         for (i = 0;i < 4;i++)
909         {
910                 if (!dpsoftrast.fb_colorpixels[i])
911                         continue;
912                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
913                 for (;y < bandy;y++)
914                 {
915                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916                         for (x = x1;x < x2;x++)
917                                 p[x] = c;
918                 }
919         }
920 }
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
922 {
923         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
924         command->r = r;
925         command->g = g;
926         command->b = b;
927         command->a = a;
928 }
929
930 DEFCOMMAND(3, ClearDepth, float depth;)
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
932 {
933         int x1, y1, x2, y2, w, h, x, y;
934         int miny1 = thread->miny1;
935         int maxy1 = thread->maxy1;
936         int miny2 = thread->miny2;
937         int maxy2 = thread->maxy2;
938         int bandy;
939         unsigned int *p;
940         unsigned int c;
941         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942         x1 = thread->fb_scissor[0];
943         y1 = thread->fb_scissor[1];
944         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
946         if (y1 < miny1) y1 = miny1;
947         if (y2 > maxy2) y2 = maxy2;
948         w = x2 - x1;
949         h = y2 - y1;
950         if (w < 1 || h < 1)
951                 return;
952         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
953         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
954         for (;y < bandy;y++)
955         {
956                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957                 for (x = x1;x < x2;x++)
958                         p[x] = c;
959         }
960 }
961 void DPSOFTRAST_ClearDepth(float d)
962 {
963         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
964         command->depth = d;
965 }
966
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
969 {
970         thread->colormask[0] = command->r != 0;
971         thread->colormask[1] = command->g != 0;
972         thread->colormask[2] = command->b != 0;
973         thread->colormask[3] = command->a != 0;
974         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
975 }
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
977 {
978         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
985 DEFCOMMAND(5, DepthTest, int enable;)
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
987 {
988         thread->depthtest = command->enable;
989         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
990 }
991 void DPSOFTRAST_DepthTest(int enable)
992 {
993         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994         command->enable = enable;
995 }
996
997 DEFCOMMAND(6, ScissorTest, int enable;)
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
999 {
1000         thread->scissortest = command->enable;
1001         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1002 }
1003 void DPSOFTRAST_ScissorTest(int enable)
1004 {
1005         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006         command->enable = enable;
1007 }
1008
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1011 {
1012         thread->scissor[0] = command->x;
1013         thread->scissor[1] = command->y;
1014         thread->scissor[2] = command->width;
1015         thread->scissor[3] = command->height;
1016         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1017 }
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1019 {
1020         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1021         command->x = x;
1022         command->y = y;
1023         command->width = width;
1024         command->height = height;
1025 }
1026
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1029 {
1030         thread->blendfunc[0] = command->sfactor;
1031         thread->blendfunc[1] = command->dfactor;
1032         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1033 }
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1035 {
1036         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037         command->sfactor = sfactor;
1038         command->dfactor = dfactor;
1039 }
1040
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1043 {
1044         thread->blendsubtract = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1046 }
1047 void DPSOFTRAST_BlendSubtract(int enable)
1048 {
1049         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(10, DepthMask, int enable;)
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1055 {
1056         thread->depthmask = command->enable;
1057 }
1058 void DPSOFTRAST_DepthMask(int enable)
1059 {
1060         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061         command->enable = enable;
1062 }
1063
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1066 {
1067         thread->depthfunc = command->func;
1068 }
1069 void DPSOFTRAST_DepthFunc(int func)
1070 {
1071         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072         command->func = func;
1073 }
1074
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1077 {
1078         thread->depthrange[0] = command->nearval;
1079         thread->depthrange[1] = command->farval;
1080 }
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1082 {
1083         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084         command->nearval = nearval;
1085         command->farval = farval;
1086 }
1087
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1090 {
1091         thread->polygonoffset[0] = command->alongnormal;
1092         thread->polygonoffset[1] = command->intoview;
1093 }
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1095 {
1096         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097         command->alongnormal = alongnormal;
1098         command->intoview = intoview;
1099 }
1100
1101 DEFCOMMAND(14, CullFace, int mode;)
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1103 {
1104         thread->cullface = command->mode;
1105 }
1106 void DPSOFTRAST_CullFace(int mode)
1107 {
1108         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109         command->mode = mode;
1110 }
1111
1112 DEFCOMMAND(15, AlphaTest, int enable;)
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1114 {
1115         thread->alphatest = command->enable;
1116 }
1117 void DPSOFTRAST_AlphaTest(int enable)
1118 {
1119         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120         command->enable = enable;
1121 }
1122
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1125 {
1126         thread->alphafunc = command->func;
1127         thread->alphavalue = command->ref;
1128 }
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1130 {
1131         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132         command->func = func;
1133         command->ref = ref;
1134 }
1135
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1137 {
1138         dpsoftrast.color[0] = r;
1139         dpsoftrast.color[1] = g;
1140         dpsoftrast.color[2] = b;
1141         dpsoftrast.color[3] = a;
1142 }
1143
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1145 {
1146         int outstride = blockwidth * 4;
1147         int instride = dpsoftrast.fb_width * 4;
1148         int bx1 = blockx;
1149         int by1 = blocky;
1150         int bx2 = blockx + blockwidth;
1151         int by2 = blocky + blockheight;
1152         int bw;
1153         int bh;
1154         int x;
1155         int y;
1156         unsigned char *inpixels;
1157         unsigned char *b;
1158         unsigned char *o;
1159         DPSOFTRAST_Flush();
1160         if (bx1 < 0) bx1 = 0;
1161         if (by1 < 0) by1 = 0;
1162         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1163         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1164         bw = bx2 - bx1;
1165         bh = by2 - by1;
1166         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1167         if (dpsoftrast.bigendian)
1168         {
1169                 for (y = by1;y < by2;y++)
1170                 {
1171                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1172                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1173                         for (x = bx1;x < bx2;x++)
1174                         {
1175                                 o[0] = b[3];
1176                                 o[1] = b[2];
1177                                 o[2] = b[1];
1178                                 o[3] = b[0];
1179                                 o += 4;
1180                                 b += 4;
1181                         }
1182                 }
1183         }
1184         else
1185         {
1186                 for (y = by1;y < by2;y++)
1187                 {
1188                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1190                         memcpy(o, b, bw*4);
1191                 }
1192         }
1193
1194 }
1195 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1196 {
1197         int tx1 = tx;
1198         int ty1 = ty;
1199         int tx2 = tx + width;
1200         int ty2 = ty + height;
1201         int sx1 = sx;
1202         int sy1 = sy;
1203         int sx2 = sx + width;
1204         int sy2 = sy + height;
1205         int swidth;
1206         int sheight;
1207         int twidth;
1208         int theight;
1209         int sw;
1210         int sh;
1211         int tw;
1212         int th;
1213         int y;
1214         unsigned int *spixels;
1215         unsigned int *tpixels;
1216         DPSOFTRAST_Texture *texture;
1217         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1218         if (mip < 0 || mip >= texture->mipmaps) return;
1219         if (texture->binds)
1220                 DPSOFTRAST_Flush();
1221         spixels = dpsoftrast.fb_colorpixels[0];
1222         swidth = dpsoftrast.fb_width;
1223         sheight = dpsoftrast.fb_height;
1224         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1225         twidth = texture->mipmap[mip][2];
1226         theight = texture->mipmap[mip][3];
1227         if (tx1 < 0) tx1 = 0;
1228         if (ty1 < 0) ty1 = 0;
1229         if (tx2 > twidth) tx2 = twidth;
1230         if (ty2 > theight) ty2 = theight;
1231         if (sx1 < 0) sx1 = 0;
1232         if (sy1 < 0) sy1 = 0;
1233         if (sx2 > swidth) sx2 = swidth;
1234         if (sy2 > sheight) sy2 = sheight;
1235         tw = tx2 - tx1;
1236         th = ty2 - ty1;
1237         sw = sx2 - sx1;
1238         sh = sy2 - sy1;
1239         if (tw > sw) tw = sw;
1240         if (th > sh) th = sh;
1241         if (tw < 1 || th < 1)
1242                 return;
1243         for (y = 0;y < th;y++)
1244                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1245         if (texture->mipmaps > 1)
1246                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1247 }
1248
1249 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1250 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1251 {
1252         if (thread->texbound[command->unitnum])
1253                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1254         thread->texbound[command->unitnum] = command->texture;
1255 }
1256 void DPSOFTRAST_SetTexture(int unitnum, int index)
1257 {
1258         DPSOFTRAST_Command_SetTexture *command;
1259         DPSOFTRAST_Texture *texture;
1260         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1261         {
1262                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1263                 return;
1264         }
1265         texture = DPSOFTRAST_Texture_GetByIndex(index);
1266         if (index && !texture)
1267         {
1268                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1269                 return;
1270         }
1271
1272         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1273         command->unitnum = unitnum;
1274         command->texture = texture;
1275
1276         dpsoftrast.texbound[unitnum] = texture;
1277         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1278 }
1279
1280 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1281 {
1282         dpsoftrast.pointer_vertex3f = vertex3f;
1283         dpsoftrast.stride_vertex = stride;
1284 }
1285 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1286 {
1287         dpsoftrast.pointer_color4f = color4f;
1288         dpsoftrast.pointer_color4ub = NULL;
1289         dpsoftrast.stride_color = stride;
1290 }
1291 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1292 {
1293         dpsoftrast.pointer_color4f = NULL;
1294         dpsoftrast.pointer_color4ub = color4ub;
1295         dpsoftrast.stride_color = stride;
1296 }
1297 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1298 {
1299         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1300         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1301         dpsoftrast.stride_texcoord[unitnum] = stride;
1302 }
1303
1304 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1305 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1306 {
1307         thread->shader_mode = command->mode;
1308         thread->shader_permutation = command->permutation;
1309 }
1310 void DPSOFTRAST_SetShader(int mode, int permutation)
1311 {
1312         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1313         command->mode = mode;
1314         command->permutation = permutation;
1315
1316         dpsoftrast.shader_mode = mode;
1317         dpsoftrast.shader_permutation = permutation;
1318 }
1319
1320 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1321 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1322 {
1323         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1324 }
1325 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1326 {
1327         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1328         command->index = index;
1329         command->val[0] = v0;
1330         command->val[1] = v1;
1331         command->val[2] = v2;
1332         command->val[3] = v3;
1333
1334         dpsoftrast.uniform4f[index*4+0] = v0;
1335         dpsoftrast.uniform4f[index*4+1] = v1;
1336         dpsoftrast.uniform4f[index*4+2] = v2;
1337         dpsoftrast.uniform4f[index*4+3] = v3;
1338 }
1339 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1340 {
1341         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1342         command->index = index;
1343         memcpy(command->val, v, sizeof(command->val));
1344
1345         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1346 }
1347
1348 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1349 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1350 {
1351         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1352 }
1353 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1354 {
1355 #ifdef SSE2_PRESENT
1356         int i, index;
1357         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1358         {
1359                 __m128 m0, m1, m2, m3;
1360                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1361                 command->index = index;
1362                 if (((size_t)v)&(ALIGN_SIZE-1))
1363                 {
1364                         m0 = _mm_loadu_ps(v);
1365                         m1 = _mm_loadu_ps(v+4);
1366                         m2 = _mm_loadu_ps(v+8);
1367                         m3 = _mm_loadu_ps(v+12);
1368                 }
1369                 else
1370                 {
1371                         m0 = _mm_load_ps(v);
1372                         m1 = _mm_load_ps(v+4);
1373                         m2 = _mm_load_ps(v+8);
1374                         m3 = _mm_load_ps(v+12);
1375                 }
1376                 if (transpose)
1377                 {
1378                         __m128 t0, t1, t2, t3;
1379                         t0 = _mm_unpacklo_ps(m0, m1);
1380                         t1 = _mm_unpacklo_ps(m2, m3);
1381                         t2 = _mm_unpackhi_ps(m0, m1);
1382                         t3 = _mm_unpackhi_ps(m2, m3);
1383                         m0 = _mm_movelh_ps(t0, t1);
1384                         m1 = _mm_movehl_ps(t1, t0);
1385                         m2 = _mm_movelh_ps(t2, t3);
1386                         m3 = _mm_movehl_ps(t3, t2);                     
1387                 }
1388                 _mm_store_ps(command->val, m0);
1389                 _mm_store_ps(command->val+4, m1);
1390                 _mm_store_ps(command->val+8, m2);
1391                 _mm_store_ps(command->val+12, m3);
1392                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1393                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1394                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1395                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1396         }
1397 #endif
1398 }
1399
1400 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1401 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1402 {
1403         thread->uniform1i[command->index] = command->val;
1404 }
1405 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1406 {
1407         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1408         command->index = index;
1409         command->val = i0;
1410
1411         dpsoftrast.uniform1i[command->index] = i0;
1412 }
1413
1414 #ifdef SSE2_PRESENT
1415 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1416 {
1417         float *end = dst + size*4;
1418         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1419         {
1420                 while (dst < end)
1421                 {
1422                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1423                         dst += 4;
1424                         src += stride;
1425                 }
1426         }
1427         else
1428         {
1429                 while (dst < end)
1430                 {
1431                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1432                         dst += 4;
1433                         src += stride;
1434                 }
1435         }
1436 }
1437
1438 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1439 {
1440         float *end = dst + size*4;
1441         if (stride == sizeof(float[3]))
1442         {
1443                 float *end4 = dst + (size&~3)*4;        
1444                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1445                 {
1446                         while (dst < end4)
1447                         {
1448                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1449                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1450                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1451                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1452                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1453                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1454                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1455                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1456                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1457                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1460                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1461                                 dst += 16;
1462                                 src += 4*sizeof(float[3]);
1463                         }
1464                 }
1465                 else
1466                 {
1467                         while (dst < end4)
1468                         {
1469                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1470                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1471                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1472                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1474                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1477                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1478                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1481                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1482                                 dst += 16;
1483                                 src += 4*sizeof(float[3]);
1484                         }
1485                 }
1486         }
1487         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1488         {
1489                 while (dst < end)
1490                 {
1491                         __m128 v = _mm_loadu_ps((const float *)src);
1492                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1493                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1494                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1495                         _mm_store_ps(dst, v);
1496                         dst += 4;
1497                         src += stride;
1498                 }
1499         }
1500         else
1501         {
1502                 while (dst < end)
1503                 {
1504                         __m128 v = _mm_load_ps((const float *)src);
1505                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1506                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1507                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1508                         _mm_store_ps(dst, v);
1509                         dst += 4;
1510                         src += stride;
1511                 }
1512         }
1513 }
1514
1515 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1516 {
1517         float *end = dst + size*4;
1518         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1519         if (stride == sizeof(float[2]))
1520         {
1521                 float *end2 = dst + (size&~1)*4;
1522                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1523                 {
1524                         while (dst < end2)
1525                         {
1526                                 __m128 v = _mm_loadu_ps((const float *)src);
1527                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1528                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1529                                 dst += 8;
1530                                 src += 2*sizeof(float[2]);
1531                         }
1532                 }
1533                 else
1534                 {
1535                         while (dst < end2)
1536                         {
1537                                 __m128 v = _mm_load_ps((const float *)src);
1538                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1539                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1540                                 dst += 8;
1541                                 src += 2*sizeof(float[2]);
1542                         }
1543                 }
1544         }
1545         while (dst < end)
1546         {
1547                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1548                 dst += 4;
1549                 src += stride;
1550         }
1551 }
1552
1553 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1554 {
1555         float *end = dst + size*4;
1556         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1557         if (stride == sizeof(unsigned char[4]))
1558         {
1559                 float *end4 = dst + (size&~3)*4;
1560                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1561                 {
1562                         while (dst < end4)
1563                         {
1564                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1565                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1566                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1567                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1568                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1569                                 dst += 16;
1570                                 src += 4*sizeof(unsigned char[4]);
1571                         }
1572                 }
1573                 else
1574                 {
1575                         while (dst < end4)
1576                         {
1577                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1578                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1579                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1580                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1581                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1582                                 dst += 16;
1583                                 src += 4*sizeof(unsigned char[4]);
1584                         }
1585                 }
1586         }
1587         while (dst < end)
1588         {
1589                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1590                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1591                 dst += 4;
1592                 src += stride;
1593         }
1594 }
1595
// Writes `size` copies of the 4-float vector at src to dst.
// src is read once with an unaligned load; dst is written with aligned
// 16-byte stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0;i < size;i++)
		_mm_store_ps(dst + i*4, value);
}
1606 #endif
1607
// Transforms numitems 4-float vectors from in4f by the 16-float matrix
// inmatrix16f and writes the results to out4f:
//   out[j] = in[0]*m[j] + in[1]*m[4+j] + in[2]*m[8+j] + in[3]*m[12+j]
// out4f may be the same pointer as in4f (in-place transform).
// When the matrix is bit-for-bit the identity the data is simply copied.
// With SSE2_PRESENT the output is written with aligned 16-byte stores;
// otherwise a scalar loop that evaluates the same sum in the same order is used.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
#ifdef SSE2_PRESENT
	__m128 m0, m1, m2, m3;
	float *end;
#else
	int i;
#endif
	// nothing to read or write (this also keeps callers that pass a NULL
	// array a no-op in builds without SSE2)
	if (!out4f || !in4f || numitems <= 0)
		return;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
	{
		// fast case for identity matrix
		if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
#ifdef SSE2_PRESENT
	end = out4f + numitems*4;
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			_mm_store_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
	else
	{
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			_mm_store_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
#else
	for (i = 0;i < numitems;i++, in4f += 4, out4f += 4)
	{
		// read the whole input vector first so in-place transforms work
		const float x = in4f[0], y = in4f[1], z = in4f[2], w = in4f[3];
		out4f[0] = x*inmatrix16f[0] + (y*inmatrix16f[4] + (z*inmatrix16f[ 8] + w*inmatrix16f[12]));
		out4f[1] = x*inmatrix16f[1] + (y*inmatrix16f[5] + (z*inmatrix16f[ 9] + w*inmatrix16f[13]));
		out4f[2] = x*inmatrix16f[2] + (y*inmatrix16f[6] + (z*inmatrix16f[10] + w*inmatrix16f[14]));
		out4f[3] = x*inmatrix16f[3] + (y*inmatrix16f[7] + (z*inmatrix16f[11] + w*inmatrix16f[15]));
	}
#endif
}
1655
// Copies numitems 4-float vectors from in4f to out4f.
// Does nothing when numitems is not positive or when both pointers are the
// same array (memcpy must not be given overlapping ranges).
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, numitems * sizeof(float[4]));
}
1660
1661 #ifdef SSE2_PRESENT
// Divides a vertex by its lane 3 and maps it through the viewport vectors.
// With in = (i0, i1, i2, i3), center = (c0, c1, c2, c3), scale = (s0, s1, s2, s3):
//   out = (c1 + s1*i0/i3, c2 + s2*i1/i3, c3 + s3*i2/i3, c0 + s0/i3)
// i.e. lanes 1-3 of the viewport vectors pair with input lanes 0-2 and lane 0
// of the viewport vectors pairs with the constant 1.
// Expands to a braced block that declares locals named p and w.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1669
// Divides a vertex by its lane 3 and maps it through the viewport vectors.
// With in = (i0, i1, i2, i3), center = (c0, c1, c2, c3), scale = (s0, s1, s2, s3):
//   out = (c1 + s1*i0/i3, c2 + s2*i1/i3, c3 + s3*i2/i3, c0 + s0/i3)
// NOTE: despite the name, this performs the full four-lane projection and is
// currently the same computation as DPSOFTRAST_PROJECTVERTEX.
// Expands to a braced block that declares locals named p and w.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1677
// Multiplies one 4-lane vertex by a matrix given as four __m128 rows:
//   out = in[0]*m0 + (in[1]*m1 + (in[2]*m2 + in[3]*m3))
// `in` is evaluated once into a local named p before `out` is assigned, so
// the same variable may be passed as both out and in.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps( \
		_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), (m0)), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), (m1)), \
			_mm_add_ps( \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), (m2)), \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), (m3))))); \
}
1686
// Computes a screen-space range along one axis for the box spanned by
// minpos/maxpos after transforming its eight corners by the matrix rows
// m0..m3, and stores it in *starty / *endy.
// Lanes 0 and 1 of every matrix row are swapped up front so that the scalar
// (_ss) operations below act on what was lane 1 of the transformed corner
// (presumably y, given the function name).
// A corner is treated as clipped when lane 2 + lane 3 of its transformed
// position is negative; for clipped corners the box edges leading to
// unclipped neighbours are intersected with that plane instead.
// Returns a bitmask with bit k set for every clipped corner k, where bit 0 of
// k selects max instead of min in lane 0, bit 1 in lane 1, and bit 2 in
// lanes 2 and 3.
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	int clipmask = 0xFF;
	int k, axis;
	__m128 corner[8], bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f);
	__m128 maxproj = _mm_set_ss(-2.0f);

	// swap lanes 0 and 1 of each matrix row
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));

	// the eight box corners, indexed by the bit pattern described above
	corner[0] = minpos;
	corner[1] = _mm_move_ss(minpos, maxpos);
	corner[2] = _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[3] = _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[4] = _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[5] = _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[6] = _mm_move_ss(maxpos, minpos);
	corner[7] = maxpos;

	// pass 1: transform every corner, classify it, and accumulate the
	// projected value (lane 0 / lane 3) of the unclipped ones
	for (k = 0;k < 8;k++)
	{
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], corner[k], m0, m1, m2, m3);
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps()))
		{
			__m128 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
			clipmask &= ~(1<<k);
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// pass 2: for every clipped corner, walk its three box edges (the
	// neighbour differs in exactly one index bit) and, where the neighbour is
	// unclipped, add the point where the edge crosses clipdist == 0
	for (k = 0;k < 8;k++)
	{
		if (!(clipmask&(1<<k)))
			continue;
		for (axis = 1;axis <= 4;axis <<= 1)
		{
			const int other = k^axis;
			__m128 frac, proj;
			if (clipmask&(1<<other))
				continue;
			frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[other]));
			proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[other], bb[k])));
			proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3)));
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// bring lane 2 of the viewport vectors into lane 0 for the scalar math
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	// limit the projected range to [-2, 2] before mapping it to the viewport
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	// note that the start comes from maxproj and the end from minproj
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1757         
1758 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1759 {
1760         float *end = out4f + numitems*4;
1761         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1762         __m128 minpos, maxpos;
1763         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1764         {
1765                 minpos = maxpos = _mm_loadu_ps(in4f);
1766                 while (out4f < end)
1767                 {
1768                         __m128 v = _mm_loadu_ps(in4f);
1769                         minpos = _mm_min_ps(minpos, v);
1770                         maxpos = _mm_max_ps(maxpos, v);
1771                         _mm_store_ps(out4f, v);
1772                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1773                         _mm_store_ps(screen4f, v);
1774                         in4f += 4;
1775                         out4f += 4;
1776                         screen4f += 4;
1777                 }
1778         }
1779         else
1780         {
1781                 minpos = maxpos = _mm_load_ps(in4f);
1782                 while (out4f < end)
1783                 {
1784                         __m128 v = _mm_load_ps(in4f);
1785                         minpos = _mm_min_ps(minpos, v);
1786                         maxpos = _mm_max_ps(maxpos, v);
1787                         _mm_store_ps(out4f, v);
1788                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1789                         _mm_store_ps(screen4f, v);
1790                         in4f += 4;
1791                         out4f += 4;
1792                         screen4f += 4;
1793                 }
1794         }
1795         if (starty && endy) 
1796                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1797                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1798                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1799                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1800                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1801         return 0;
1802 }
1803
1804 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1805 {
1806         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1807         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1808         float *end;
1809         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1810                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1811         end = out4f + numitems*4;
1812         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1813         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1814         m0 = _mm_loadu_ps(inmatrix16f);
1815         m1 = _mm_loadu_ps(inmatrix16f + 4);
1816         m2 = _mm_loadu_ps(inmatrix16f + 8);
1817         m3 = _mm_loadu_ps(inmatrix16f + 12);
1818         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1819         {
1820                 minpos = maxpos = _mm_loadu_ps(in4f);
1821                 while (out4f < end)
1822                 {
1823                         __m128 v = _mm_loadu_ps(in4f);
1824                         minpos = _mm_min_ps(minpos, v);
1825                         maxpos = _mm_max_ps(maxpos, v);
1826                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1827                         _mm_store_ps(out4f, v);
1828                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1829                         _mm_store_ps(screen4f, v);
1830                         in4f += 4;
1831                         out4f += 4;
1832                         screen4f += 4;
1833                 }
1834         }
1835         else
1836         {
1837                 minpos = maxpos = _mm_load_ps(in4f);
1838                 while (out4f < end)
1839                 {
1840                         __m128 v = _mm_load_ps(in4f);
1841                         minpos = _mm_min_ps(minpos, v);
1842                         maxpos = _mm_max_ps(maxpos, v);
1843                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1844                         _mm_store_ps(out4f, v);
1845                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1846                         _mm_store_ps(screen4f, v);
1847                         in4f += 4;
1848                         out4f += 4;
1849                         screen4f += 4;
1850                 }
1851         }
1852         if (starty && endy) 
1853                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1854         return 0;
1855 }
1856 #endif
1857
// Fills dpsoftrast.post_array4f[outarray] with 4-float vectors converted from
// the client array selected by inarray, covering dpsoftrast.numvertices
// vertices starting at dpsoftrast.firstvertex, and returns that buffer.
//  - DPSOFTRAST_ARRAY_POSITION: 3-float positions
//  - DPSOFTRAST_ARRAY_COLOR: 4-float colors if set, else 4-byte colors if set,
//    else the constant dpsoftrast.color
//  - anything else is treated as texcoord unit
//    (inarray - DPSOFTRAST_ARRAY_TEXCOORD0) with 2, 3 or 4 float components;
//    if that unit has no pointer or another component count, the buffer is
//    returned without being written
// Returns NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float * const outf = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, count, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			const int components = dpsoftrast.components_texcoord[unit];
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			if (components == 2)
				DPSOFTRAST_Load2fTo4f(outf, inb, count, stride);
			else if (components == 3)
				DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
			else if (components == 4)
				DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1916
1917 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1918 {
1919         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1920         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1921         return data;
1922 }
1923
#if 0
// Currently compiled out.
// Projects a vertex array without transforming it: a non-negative inarray is
// first loaded into post_array4f[outarray], a negative one uses that buffer
// as it is.  Screen coordinates go to dpsoftrast.screencoord4f, and the clip
// mask and vertical range go to dpsoftrast.drawclipped / drawstarty / drawendy.
// Returns NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray < 0)
		data = dpsoftrast.post_array4f[outarray];
	else
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1936
// Transforms a vertex array in place by inmatrix16f and projects it.
// A non-negative inarray is first loaded into post_array4f[outarray]; a
// negative one uses that buffer as it is.  Screen coordinates go to
// dpsoftrast.screencoord4f, and the value returned by
// DPSOFTRAST_Vertex_TransformProject plus the vertical range it computes are
// stored in dpsoftrast.drawclipped / drawstarty / drawendy.
// Returns the transformed array, or NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray < 0)
		data = dpsoftrast.post_array4f[outarray];
	else
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1947
1948 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1949 {
1950         int x;
1951         int startx = span->startx;
1952         int endx = span->endx;
1953         float wslope = triangle->w[0];
1954         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1955         float endz = 1.0f / (w + wslope * startx);
1956         for (x = startx;x < endx;)
1957         {
1958                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1959                 float z = endz, dz;
1960                 if (nextsub >= endx) nextsub = endsub = endx-1;
1961                 endz = 1.0f / (w + wslope * nextsub);
1962                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1963                 for (; x <= endsub; x++, z += dz)
1964                         zf[x] = z;
1965         }
1966 }
1967
1968 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1969 {
1970         int x;
1971         int startx = span->startx;
1972         int endx = span->endx;
1973         int d[4];
1974         float a, b;
1975         unsigned char * RESTRICT pixelmask = span->pixelmask;
1976         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1977         if (!pixel)
1978                 return;
1979         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1980         // handle alphatest now (this affects depth writes too)
1981         if (thread->alphatest)
1982                 for (x = startx;x < endx;x++)
1983                         if (in4f[x*4+3] < 0.5f)
1984                                 pixelmask[x] = false;
1985         // FIXME: this does not handle bigendian
1986         switch(thread->fb_blendmode)
1987         {
1988         case DPSOFTRAST_BLENDMODE_OPAQUE:
1989                 for (x = startx;x < endx;x++)
1990                 {
1991                         if (!pixelmask[x])
1992                                 continue;
1993                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1994                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1995                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1996                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1997                         pixel[x*4+0] = d[0];
1998                         pixel[x*4+1] = d[1];
1999                         pixel[x*4+2] = d[2];
2000                         pixel[x*4+3] = d[3];
2001                 }
2002                 break;
2003         case DPSOFTRAST_BLENDMODE_ALPHA:
2004                 for (x = startx;x < endx;x++)
2005                 {
2006                         if (!pixelmask[x])
2007                                 continue;
2008                         a = in4f[x*4+3] * 255.0f;
2009                         b = 1.0f - in4f[x*4+3];
2010                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2011                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2012                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2013                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2014                         pixel[x*4+0] = d[0];
2015                         pixel[x*4+1] = d[1];
2016                         pixel[x*4+2] = d[2];
2017                         pixel[x*4+3] = d[3];
2018                 }
2019                 break;
2020         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2021                 for (x = startx;x < endx;x++)
2022                 {
2023                         if (!pixelmask[x])
2024                                 continue;
2025                         a = in4f[x*4+3] * 255.0f;
2026                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2027                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2028                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2029                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2030                         pixel[x*4+0] = d[0];
2031                         pixel[x*4+1] = d[1];
2032                         pixel[x*4+2] = d[2];
2033                         pixel[x*4+3] = d[3];
2034                 }
2035                 break;
2036         case DPSOFTRAST_BLENDMODE_ADD:
2037                 for (x = startx;x < endx;x++)
2038                 {
2039                         if (!pixelmask[x])
2040                                 continue;
2041                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2042                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2043                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2044                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2045                         pixel[x*4+0] = d[0];
2046                         pixel[x*4+1] = d[1];
2047                         pixel[x*4+2] = d[2];
2048                         pixel[x*4+3] = d[3];
2049                 }
2050                 break;
2051         case DPSOFTRAST_BLENDMODE_INVMOD:
2052                 for (x = startx;x < endx;x++)
2053                 {
2054                         if (!pixelmask[x])
2055                                 continue;
2056                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2057                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2058                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2059                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2060                         pixel[x*4+0] = d[0];
2061                         pixel[x*4+1] = d[1];
2062                         pixel[x*4+2] = d[2];
2063                         pixel[x*4+3] = d[3];
2064                 }
2065                 break;
2066         case DPSOFTRAST_BLENDMODE_MUL:
2067                 for (x = startx;x < endx;x++)
2068                 {
2069                         if (!pixelmask[x])
2070                                 continue;
2071                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2072                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2073                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2074                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2075                         pixel[x*4+0] = d[0];
2076                         pixel[x*4+1] = d[1];
2077                         pixel[x*4+2] = d[2];
2078                         pixel[x*4+3] = d[3];
2079                 }
2080                 break;
2081         case DPSOFTRAST_BLENDMODE_MUL2:
2082                 for (x = startx;x < endx;x++)
2083                 {
2084                         if (!pixelmask[x])
2085                                 continue;
2086                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2087                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2088                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2089                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2090                         pixel[x*4+0] = d[0];
2091                         pixel[x*4+1] = d[1];
2092                         pixel[x*4+2] = d[2];
2093                         pixel[x*4+3] = d[3];
2094                 }
2095                 break;
2096         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2097                 for (x = startx;x < endx;x++)
2098                 {
2099                         if (!pixelmask[x])
2100                                 continue;
2101                         a = in4f[x*4+3] * -255.0f;
2102                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2103                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2104                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2105                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2106                         pixel[x*4+0] = d[0];
2107                         pixel[x*4+1] = d[1];
2108                         pixel[x*4+2] = d[2];
2109                         pixel[x*4+3] = d[3];
2110                 }
2111                 break;
2112         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2113                 for (x = startx;x < endx;x++)
2114                 {
2115                         if (!pixelmask[x])
2116                                 continue;
2117                         a = 255.0f;
2118                         b = 1.0f - in4f[x*4+3];
2119                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2120                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2121                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2122                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2123                         pixel[x*4+0] = d[0];
2124                         pixel[x*4+1] = d[1];
2125                         pixel[x*4+2] = d[2];
2126                         pixel[x*4+3] = d[3];
2127                 }
2128                 break;
2129         case DPSOFTRAST_BLENDMODE_INVADD:
2130                 for (x = startx;x < endx;x++)
2131                 {
2132                         if (!pixelmask[x])
2133                                 continue;
2134                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2135                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2136                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2137                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2138                         pixel[x*4+0] = d[0];
2139                         pixel[x*4+1] = d[1];
2140                         pixel[x*4+2] = d[2];
2141                         pixel[x*4+3] = d[3];
2142                 }
2143                 break;
2144         }
2145 }
2146
2147 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2148 {
2149 #ifdef SSE2_PRESENT
2150         int x;
2151         int startx = span->startx;
2152         int endx = span->endx;
2153         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2154         unsigned char * RESTRICT pixelmask = span->pixelmask;
2155         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2156         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2157         if (!pixel)
2158                 return;
2159         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2160         pixeli += span->y * dpsoftrast.fb_width + span->x;
2161         // handle alphatest now (this affects depth writes too)
2162         if (thread->alphatest)
2163                 for (x = startx;x < endx;x++)
2164                         if (in4ub[x*4+3] < 0.5f)
2165                                 pixelmask[x] = false;
2166         // FIXME: this does not handle bigendian
2167         switch(thread->fb_blendmode)
2168         {
2169         case DPSOFTRAST_BLENDMODE_OPAQUE:
2170                 for (x = startx;x + 4 <= endx;)
2171                 {
2172                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2173                         {
2174                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2175                                 x += 4;
2176                         }
2177                         else
2178                         {
2179                                 if (pixelmask[x])
2180                                         pixeli[x] = ini[x];
2181                                 x++;
2182                         }
2183                 }
2184                 for (;x < endx;x++)
2185                         if (pixelmask[x])
2186                                 pixeli[x] = ini[x];
2187                 break;
2188         case DPSOFTRAST_BLENDMODE_ALPHA:
2189         #define FINISHBLEND(blend2, blend1) \
2190                 for (x = startx;x + 1 < endx;x += 2) \
2191                 { \
2192                         __m128i src, dst; \
2193                         switch (*(const unsigned short*)&pixelmask[x]) \
2194                         { \
2195                         case 0x0101: \
2196                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2197                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2198                                 blend2; \
2199                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2200                                 continue; \
2201                         case 0x0100: \
2202                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2203                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2204                                 blend1; \
2205                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2206                                 continue; \
2207                         case 0x0001: \
2208                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2209                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2210                                 blend1; \
2211                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2212                                 continue; \
2213                         } \
2214                         break; \
2215                 } \
2216                 for(;x < endx; x++) \
2217                 { \
2218                         __m128i src, dst; \
2219                         if (!pixelmask[x]) \
2220                                 continue; \
2221                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2222                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2223                         blend1; \
2224                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2225                 }
2226
2227                 FINISHBLEND({
2228                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2229                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230                 }, {
2231                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2232                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2233                 });
2234                 break;
2235         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236                 FINISHBLEND({
2237                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2238                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239                 }, {
2240                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2241                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2242                 });
2243                 break;
2244         case DPSOFTRAST_BLENDMODE_ADD:
2245                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2246                 break;
2247         case DPSOFTRAST_BLENDMODE_INVMOD:
2248                 FINISHBLEND({
2249                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2250                 }, {
2251                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2252                 });
2253                 break;
2254         case DPSOFTRAST_BLENDMODE_MUL:
2255                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2256                 break;
2257         case DPSOFTRAST_BLENDMODE_MUL2:
2258                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2259                 break;
2260         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2261                 FINISHBLEND({
2262                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2263                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264                 }, {
2265                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2266                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2267                 });
2268                 break;
2269         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2270                 FINISHBLEND({
2271                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2272                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273                 }, {
2274                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2275                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2276                 });
2277                 break;
2278         case DPSOFTRAST_BLENDMODE_INVADD:
2279                 FINISHBLEND({
2280                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2281                 }, {
2282                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2283                 });
2284                 break;
2285         }
2286 #endif
2287 }
2288
2289 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2290 {
2291         int x;
2292         int startx = span->startx;
2293         int endx = span->endx;
2294         int flags;
2295         float c[4];
2296         float data[4];
2297         float slope[4];
2298         float tc[2], endtc[2];
2299         float tcscale[2];
2300         unsigned int tci[2];
2301         unsigned int tci1[2];
2302         unsigned int tcimin[2];
2303         unsigned int tcimax[2];
2304         int tciwrapmask[2];
2305         int tciwidth;
2306         int filter;
2307         int mip;
2308         const unsigned char * RESTRICT pixelbase;
2309         const unsigned char * RESTRICT pixel[4];
2310         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2311         // if no texture is bound, just fill it with white
2312         if (!texture)
2313         {
2314                 for (x = startx;x < endx;x++)
2315                 {
2316                         out4f[x*4+0] = 1.0f;
2317                         out4f[x*4+1] = 1.0f;
2318                         out4f[x*4+2] = 1.0f;
2319                         out4f[x*4+3] = 1.0f;
2320                 }
2321                 return;
2322         }
2323         mip = triangle->mip[texunitindex];
2324         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2325         // if this mipmap of the texture is 1 pixel, just fill it with that color
2326         if (texture->mipmap[mip][1] == 4)
2327         {
2328                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2329                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2330                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2331                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2332                 for (x = startx;x < endx;x++)
2333                 {
2334                         out4f[x*4+0] = c[0];
2335                         out4f[x*4+1] = c[1];
2336                         out4f[x*4+2] = c[2];
2337                         out4f[x*4+3] = c[3];
2338                 }
2339                 return;
2340         }
2341         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2342         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2343         flags = texture->flags;
2344         tcscale[0] = texture->mipmap[mip][2];
2345         tcscale[1] = texture->mipmap[mip][3];
2346         tciwidth = texture->mipmap[mip][2];
2347         tcimin[0] = 0;
2348         tcimin[1] = 0;
2349         tcimax[0] = texture->mipmap[mip][2]-1;
2350         tcimax[1] = texture->mipmap[mip][3]-1;
2351         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2352         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2353         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2354         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2355         for (x = startx;x < endx;)
2356         {
2357                 unsigned int subtc[2];
2358                 unsigned int substep[2];
2359                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2360                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2361                 if (nextsub >= endx)
2362                 {
2363                         nextsub = endsub = endx-1;      
2364                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2365                 }
2366                 tc[0] = endtc[0];
2367                 tc[1] = endtc[1];
2368                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2369                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2370                 substep[0] = (endtc[0] - tc[0]) * subscale;
2371                 substep[1] = (endtc[1] - tc[1]) * subscale;
2372                 subtc[0] = tc[0] * (1<<16);
2373                 subtc[1] = tc[1] * (1<<16);
2374                 if (filter)
2375                 {
2376                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2377                         {
2378                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2379                                 {
2380                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2381                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2382                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2383                                         tci[0] = subtc[0]>>16;
2384                                         tci[1] = subtc[1]>>16;
2385                                         tci1[0] = tci[0] + 1;
2386                                         tci1[1] = tci[1] + 1;
2387                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2388                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2389                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2390                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2391                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2392                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2393                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2394                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2395                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2396                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2397                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2398                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2399                                         out4f[x*4+0] = c[0];
2400                                         out4f[x*4+1] = c[1];
2401                                         out4f[x*4+2] = c[2];
2402                                         out4f[x*4+3] = c[3];
2403                                 }
2404                         }
2405                         else
2406                         {
2407                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2408                                 {
2409                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2410                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2411                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2412                                         tci[0] = subtc[0]>>16;
2413                                         tci[1] = subtc[1]>>16;
2414                                         tci1[0] = tci[0] + 1;
2415                                         tci1[1] = tci[1] + 1;
2416                                         tci[0] &= tciwrapmask[0];
2417                                         tci[1] &= tciwrapmask[1];
2418                                         tci1[0] &= tciwrapmask[0];
2419                                         tci1[1] &= tciwrapmask[1];
2420                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2421                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2422                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2423                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2424                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2425                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2426                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2427                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2428                                         out4f[x*4+0] = c[0];
2429                                         out4f[x*4+1] = c[1];
2430                                         out4f[x*4+2] = c[2];
2431                                         out4f[x*4+3] = c[3];
2432                                 }
2433                         }
2434                 }
2435                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2436                 {
2437                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2438                         {
2439                                 tci[0] = subtc[0]>>16;
2440                                 tci[1] = subtc[1]>>16;
2441                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2442                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2443                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2444                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2445                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2446                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2447                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2448                                 out4f[x*4+0] = c[0];
2449                                 out4f[x*4+1] = c[1];
2450                                 out4f[x*4+2] = c[2];
2451                                 out4f[x*4+3] = c[3];
2452                         }
2453                 }
2454                 else
2455                 {
2456                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2457                         {
2458                                 tci[0] = subtc[0]>>16;
2459                                 tci[1] = subtc[1]>>16;
2460                                 tci[0] &= tciwrapmask[0];
2461                                 tci[1] &= tciwrapmask[1];
2462                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2463                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2464                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2465                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2466                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2467                                 out4f[x*4+0] = c[0];
2468                                 out4f[x*4+1] = c[1];
2469                                 out4f[x*4+2] = c[2];
2470                                 out4f[x*4+3] = c[3];
2471                         }
2472                 }
2473         }
2474 }
2475
2476 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2477 {
2478 #ifdef SSE2_PRESENT
2479         int x;
2480         int startx = span->startx;
2481         int endx = span->endx;
2482         int flags;
2483         __m128 data, slope, tcscale;
2484         __m128i tcsize, tcmask, tcoffset, tcmax;
2485         __m128 tc, endtc;
2486         __m128i subtc, substep, endsubtc;
2487         int filter;
2488         int mip;
2489         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2490         const unsigned char * RESTRICT pixelbase;
2491         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2492         // if no texture is bound, just fill it with white
2493         if (!texture)
2494         {
2495                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2496                 return;
2497         }
2498         mip = triangle->mip[texunitindex];
2499         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2500         // if this mipmap of the texture is 1 pixel, just fill it with that color
2501         if (texture->mipmap[mip][1] == 4)
2502         {
2503                 unsigned int k = *((const unsigned int *)pixelbase);
2504                 for (x = startx;x < endx;x++)
2505                         outi[x] = k;
2506                 return;
2507         }
2508         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2509         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2510         flags = texture->flags;
2511         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2512         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2513         tcscale = _mm_cvtepi32_ps(tcsize);
2514         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2515         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2516         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2517         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2518         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2519         tcmax = _mm_packs_epi32(tcmask, tcmask);
2520         for (x = startx;x < endx;)
2521         {
2522                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2523                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2524                 if (nextsub >= endx)
2525                 {
2526                         nextsub = endsub = endx-1;
2527                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2528                 }       
2529                 tc = endtc;
2530                 subtc = endsubtc;
2531                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2532                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2533                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2534                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2535                 substep = _mm_slli_epi32(substep, 1);
2536                 if (filter)
2537                 {
2538                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2539                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2540                         {
2541                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2542                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2543                                 {
2544                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2545                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2546                                         tci = _mm_madd_epi16(tci, tcoffset);
2547                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2548                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2549                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2550                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2551                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2552                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2553                                         fracm = _mm_srli_epi16(subtc, 1);
2554                                         pix1 = _mm_add_epi16(pix1,
2555                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2556                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2557                                         pix3 = _mm_add_epi16(pix3,
2558                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2559                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2560                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2561                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2562                                         pix2 = _mm_add_epi16(pix2,
2563                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2564                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2565                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2566                                 }
2567                                 if (x <= endsub)
2568                                 {
2569                                         const unsigned char * RESTRICT ptr1;
2570                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2571                                         tci = _mm_madd_epi16(tci, tcoffset);
2572                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2573                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2574                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2575                                         fracm = _mm_srli_epi16(subtc, 1);
2576                                         pix1 = _mm_add_epi16(pix1,
2577                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2578                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2579                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2580                                         pix1 = _mm_add_epi16(pix1,
2581                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2582                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2583                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2584                                         x++;
2585                                 }
2586                         }
2587                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2588                         {
2589                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2590                                 {
2591                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2592                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2593                                         tci = _mm_madd_epi16(tci, tcoffset);
2594                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2595                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2596                                                                                         _mm_setzero_si128());
2597                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2598                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2599                                                                                         _mm_setzero_si128());
2600                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2601                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2602                                         tci = _mm_madd_epi16(tci, tcoffset);
2603                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2604                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2605                                                                                         _mm_setzero_si128());
2606                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2607                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2608                                                                                         _mm_setzero_si128());
2609                                         fracm = _mm_srli_epi16(subtc, 1);
2610                                         pix1 = _mm_add_epi16(pix1,
2611                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2612                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2613                                         pix3 = _mm_add_epi16(pix3,
2614                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2615                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2616                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2617                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2618                                         pix2 = _mm_add_epi16(pix2,
2619                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2620                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2621                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2622                                 }
2623                                 if (x <= endsub)
2624                                 {
2625                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2626                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2627                                         tci = _mm_madd_epi16(tci, tcoffset);
2628                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2629                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2630                                                                                         _mm_setzero_si128());
2631                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2632                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2633                                                                                         _mm_setzero_si128());
2634                                         fracm = _mm_srli_epi16(subtc, 1);
2635                                         pix1 = _mm_add_epi16(pix1,
2636                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2637                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2638                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2639                                         pix1 = _mm_add_epi16(pix1,
2640                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2641                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2642                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2643                                         x++;
2644                                 }
2645                         }
2646                         else
2647                         {
2648                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2649                                 {
2650                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2651                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2652                                         tci = _mm_madd_epi16(tci, tcoffset);
2653                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2654                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2655                                                                                         _mm_setzero_si128());
2656                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2657                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2658                                                                                         _mm_setzero_si128());
2659                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2660                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2661                                         tci = _mm_madd_epi16(tci, tcoffset);
2662                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2663                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2664                                                                                         _mm_setzero_si128());
2665                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2666                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2667                                                                                         _mm_setzero_si128());
2668                                         fracm = _mm_srli_epi16(subtc, 1);
2669                                         pix1 = _mm_add_epi16(pix1,
2670                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2671                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2672                                         pix3 = _mm_add_epi16(pix3,
2673                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2674                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2675                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2676                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2677                                         pix2 = _mm_add_epi16(pix2,
2678                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2679                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2680                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2681                                 }
2682                                 if (x <= endsub)
2683                                 {
2684                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2685                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2686                                         tci = _mm_madd_epi16(tci, tcoffset);
2687                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2688                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2689                                                                                         _mm_setzero_si128());
2690                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2691                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2692                                                                                         _mm_setzero_si128());
2693                                         fracm = _mm_srli_epi16(subtc, 1);
2694                                         pix1 = _mm_add_epi16(pix1,
2695                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2696                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2697                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2698                                         pix1 = _mm_add_epi16(pix1,
2699                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2700                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2701                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2702                                         x++;
2703                                 }
2704                         }
2705                 }
2706                 else
2707                 {
2708                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2709                         {
2710                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2711                                 {
2712                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2713                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2714                                         tci = _mm_madd_epi16(tci, tcoffset);
2715                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2716                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2717                                 }
2718                                 if (x <= endsub)
2719                                 {
2720                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2721                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2722                                         tci = _mm_madd_epi16(tci, tcoffset);
2723                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2724                                         x++;
2725                                 }
2726                         }
2727                         else
2728                         {
2729                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2730                                 {
2731                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2732                                         tci = _mm_and_si128(tci, tcmax); 
2733                                         tci = _mm_madd_epi16(tci, tcoffset);
2734                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2735                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2736                                 }
2737                                 if (x <= endsub)
2738                                 {
2739                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2740                                         tci = _mm_and_si128(tci, tcmax); 
2741                                         tci = _mm_madd_epi16(tci, tcoffset);
2742                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2743                                         x++;
2744                                 }
2745                         }
2746                 }
2747         }
2748 #endif
2749 }
2750
2751 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2752 {
2753         // TODO: IMPLEMENT
2754         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2755 }
2756
// Shadowmap sampling stub: no lookup is performed yet, so the result is
// always 1.0f regardless of the vector passed in.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2762
2763 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2764 {
2765         int x;
2766         int startx = span->startx;
2767         int endx = span->endx;
2768         float c[4];
2769         float data[4];
2770         float slope[4];
2771         float z;
2772         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2773         for (x = startx;x < endx;x++)
2774         {
2775                 z = zf[x];
2776                 c[0] = (data[0] + slope[0]*x) * z;
2777                 c[1] = (data[1] + slope[1]*x) * z;
2778                 c[2] = (data[2] + slope[2]*x) * z;
2779                 c[3] = (data[3] + slope[3]*x) * z;
2780                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2781                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2782                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2783                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2784         }
2785 }
2786
2787 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2788 {
2789         int x;
2790         int startx = span->startx;
2791         int endx = span->endx;
2792         float c[4];
2793         float data[4];
2794         float slope[4];
2795         float z;
2796         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2797         for (x = startx;x < endx;x++)
2798         {
2799                 z = zf[x];
2800                 c[0] = (data[0] + slope[0]*x) * z;
2801                 c[1] = (data[1] + slope[1]*x) * z;
2802                 c[2] = (data[2] + slope[2]*x) * z;
2803                 c[3] = (data[3] + slope[3]*x) * z;
2804                 out4f[x*4+0] = c[0];
2805                 out4f[x*4+1] = c[1];
2806                 out4f[x*4+2] = c[2];
2807                 out4f[x*4+3] = c[3];
2808         }
2809 }
2810
2811 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2812 {
2813         int x, startx = span->startx, endx = span->endx;
2814         float c[4], localcolor[4];
2815         localcolor[0] = subcolor[0];
2816         localcolor[1] = subcolor[1];
2817         localcolor[2] = subcolor[2];
2818         localcolor[3] = subcolor[3];
2819         for (x = startx;x < endx;x++)
2820         {
2821                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2822                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2823                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2824                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2825                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2826                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2827                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2828                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2829         }
2830 }
2831
2832 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2833 {
2834         int x, startx = span->startx, endx = span->endx;
2835         for (x = startx;x < endx;x++)
2836         {
2837                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2838                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2839                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2840                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2841         }
2842 }
2843
2844 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2845 {
2846         int x, startx = span->startx, endx = span->endx;
2847         for (x = startx;x < endx;x++)
2848         {
2849                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2850                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2851                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2852                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2853         }
2854 }
2855
2856 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2857 {
2858         int x, startx = span->startx, endx = span->endx;
2859         float a, b;
2860         for (x = startx;x < endx;x++)
2861         {
2862                 a = 1.0f - inb4f[x*4+3];
2863                 b = inb4f[x*4+3];
2864                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2865                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2866                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2867                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2868         }
2869 }
2870
2871 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2872 {
2873         int x, startx = span->startx, endx = span->endx;
2874         float localcolor[4], ilerp, lerp;
2875         localcolor[0] = color[0];
2876         localcolor[1] = color[1];
2877         localcolor[2] = color[2];
2878         localcolor[3] = color[3];
2879         ilerp = 1.0f - localcolor[3];
2880         lerp = localcolor[3];
2881         for (x = startx;x < endx;x++)
2882         {
2883                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2884                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2885                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2886                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2887         }
2888 }
2889
2890
2891
2892 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2893 {
2894 #ifdef SSE2_PRESENT
2895         int x;
2896         int startx = span->startx;
2897         int endx = span->endx;
2898         __m128 data, slope;
2899         __m128 mod, endmod;
2900         __m128i submod, substep, endsubmod;
2901         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2902         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2903         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2904         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2905         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2906         for (x = startx; x < endx;)
2907         {
2908                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2909                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2910                 if (nextsub >= endx)
2911                 {
2912                         nextsub = endsub = endx-1;
2913                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2914                 }
2915                 mod = endmod;
2916                 submod = endsubmod;
2917                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2918                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2919                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2920                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2921                 substep = _mm_packs_epi32(substep, substep);
2922                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2923                 {
2924                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2925                         pix = _mm_mulhi_epu16(pix, submod);
2926                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2927                 }
2928                 if (x <= endsub)
2929                 {
2930                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2931                         pix = _mm_mulhi_epu16(pix, submod);
2932                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2933                         x++;
2934                 }
2935         }
2936 #endif
2937 }
2938
2939 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2940 {
2941 #ifdef SSE2_PRESENT
2942         int x;
2943         int startx = span->startx;
2944         int endx = span->endx;
2945         __m128 data, slope;
2946         __m128 mod, endmod;
2947         __m128i submod, substep, endsubmod;
2948         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2949         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2950         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2951         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2952         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2953         for (x = startx; x < endx;)
2954         {
2955                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2956                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2957                 if (nextsub >= endx)
2958                 {
2959                         nextsub = endsub = endx-1;
2960                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2961                 }
2962                 mod = endmod;
2963                 submod = endsubmod;
2964                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2965                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2966                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2967                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2968                 substep = _mm_packs_epi32(substep, substep);
2969                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2970                 {
2971                         __m128i pix = _mm_srai_epi16(submod, 4);
2972                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2973                 }
2974                 if (x <= endsub)
2975                 {
2976                         __m128i pix = _mm_srai_epi16(submod, 4);
2977                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2978                         x++;
2979                 }
2980         }
2981 #endif
2982 }
2983
2984 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2985 {
2986 #ifdef SSE2_PRESENT
2987         int x, startx = span->startx, endx = span->endx;
2988         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2989         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2990         for (x = startx;x+2 <= endx;x+=2)
2991         {
2992                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2993                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2994                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2995                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2996         }
2997         if (x < endx)
2998         {
2999                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3000                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3001                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3002                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3003         }
3004 #endif
3005 }
3006
3007 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3008 {
3009 #ifdef SSE2_PRESENT
3010         int x, startx = span->startx, endx = span->endx;
3011         for (x = startx;x+2 <= endx;x+=2)
3012         {
3013                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3014                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3015                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3016                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3017         }
3018         if (x < endx)
3019         {
3020                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3021                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3022                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3023                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3024         }
3025 #endif
3026 }
3027
3028 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3029 {
3030 #ifdef SSE2_PRESENT
3031         int x, startx = span->startx, endx = span->endx;
3032         for (x = startx;x+2 <= endx;x+=2)
3033         {
3034                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3035                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3036                 pix1 = _mm_add_epi16(pix1, pix2);
3037                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3038         }
3039         if (x < endx)
3040         {
3041                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3042                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3043                 pix1 = _mm_add_epi16(pix1, pix2);
3044                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3045         }
3046 #endif
3047 }
3048
3049 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3050 {
3051 #ifdef SSE2_PRESENT
3052         int x, startx = span->startx, endx = span->endx;
3053         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3054         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3055         for (x = startx;x+2 <= endx;x+=2)
3056         {
3057                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3058                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3059                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3060                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3061         }
3062         if (x < endx)
3063         {
3064                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3065                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3066                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3067                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3068         }
3069 #endif
3070 }
3071
3072 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3073 {
3074 #ifdef SSE2_PRESENT
3075         int x, startx = span->startx, endx = span->endx;
3076         for (x = startx;x+2 <= endx;x+=2)
3077         {
3078                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3079                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3080                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3081                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3082                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3083         }
3084         if (x < endx)
3085         {
3086                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3087                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3088                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3089                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3090                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3091         }
3092 #endif
3093 }
3094
3095 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3096 {
3097 #ifdef SSE2_PRESENT
3098         int x, startx = span->startx, endx = span->endx;
3099         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3100         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3101         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3102         for (x = startx;x+2 <= endx;x+=2)
3103         {
3104                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3105                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3106                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3107         }
3108         if (x < endx)
3109         {
3110                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3111                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3112                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3113         }
3114 #endif
3115 }
3116
3117
3118
3119 void DPSOFTRAST_VertexShader_Generic(void)
3120 {
3121         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3122         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3123         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3124         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3125                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3126 }
3127
3128 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3129 {
3130         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3131         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3132         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3133         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3134         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3135         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3136         {
3137                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3138                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3139                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3140                 {
3141                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3142                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3143                         {
3144                                 // multiply
3145                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3146                         }
3147                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3148                         {
3149                                 // add
3150                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3151                         }
3152                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3153                         {
3154                                 // alphablend
3155                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3156                         }
3157                 }
3158         }
3159         else
3160                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3161         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3162 }
3163
3164
3165
3166 void DPSOFTRAST_VertexShader_PostProcess(void)
3167 {
3168         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3169         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3170         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3171 }
3172
3173 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3174 {
3175         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3176         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3177         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3178         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3179         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3180         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3181         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3182         {
3183                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3184                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3185         }
3186         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3187         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3188         {
3189                 // TODO: implement saturation
3190         }
3191         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3192         {
3193                 // TODO: implement gammaramps
3194         }
3195         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3196 }
3197
3198
3199
3200 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3201 {
3202         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3203 }
3204
3205 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3206 {
3207         // this is never called (because colormask is off when this shader is used)
3208         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3209         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3210         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3211         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3212         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3213 }
3214
3215
3216
3217 void DPSOFTRAST_VertexShader_FlatColor(void)
3218 {
3219         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3220         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3221 }
3222
3223 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3224 {
3225 #ifdef SSE2_PRESENT
3226         unsigned char * RESTRICT pixelmask = span->pixelmask;
3227         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3228         int x, startx = span->startx, endx = span->endx;
3229         __m128i Color_Ambientm;
3230         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3231         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3232         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3233         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3234         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3235         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3236                 pixel = buffer_FragColorbgra8;
3237         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3238         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3239         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3240         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3241         for (x = startx;x < endx;x++)
3242         {
3243                 __m128i color, pix;
3244                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3245                 {
3246                         __m128i pix2;
3247                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3248                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3249                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3250                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3251                         x += 3;
3252                         continue;
3253                 }
3254                 if (!pixelmask[x])
3255                         continue;
3256                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3257                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3258                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3259         }
3260         if (pixel == buffer_FragColorbgra8)
3261                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3262 #endif
3263 }
3264
3265
3266
3267 void DPSOFTRAST_VertexShader_VertexColor(void)
3268 {
3269         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3270         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3271         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3272 }
3273
3274 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3275 {
3276 #ifdef SSE2_PRESENT
3277         unsigned char * RESTRICT pixelmask = span->pixelmask;
3278         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3279         int x, startx = span->startx, endx = span->endx;
3280         __m128i Color_Ambientm, Color_Diffusem;
3281         __m128 data, slope;
3282         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3283         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3285         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3286         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3287         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3288         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3289                 pixel = buffer_FragColorbgra8;
3290         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3291         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3292         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3293         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3294         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3295         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3296         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3297         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3298         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3299         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3300         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3301         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3302         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3303         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3304         {
3305                 __m128i color, mod, pix;
3306                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3307                 {
3308                         __m128i pix2, mod2;
3309                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3310                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3311                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3312                         data = _mm_add_ps(data, slope);
3313                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3314                         data = _mm_add_ps(data, slope);
3315                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3316                         data = _mm_add_ps(data, slope);
3317                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3318                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3319                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3320                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3321                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3322                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3323                         x += 3;
3324                         continue;
3325                 }
3326                 if (!pixelmask[x])
3327                         continue;
3328                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3329                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3330                 mod = _mm_packs_epi32(mod, mod);
3331                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3332                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3333         }
3334         if (pixel == buffer_FragColorbgra8)
3335                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3336 #endif
3337 }
3338
3339
3340
3341 void DPSOFTRAST_VertexShader_Lightmap(void)
3342 {
3343         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3344         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3345         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3346 }
3347
3348 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3349 {
3350 #ifdef SSE2_PRESENT
3351         unsigned char * RESTRICT pixelmask = span->pixelmask;
3352         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3353         int x, startx = span->startx, endx = span->endx;
3354         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3355         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3356         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3358         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3359         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3360         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3361         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3362         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3363         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3364                 pixel = buffer_FragColorbgra8;
3365         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3366         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3367         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3368         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3369         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3370         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3371         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3372         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3373         {
3374                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3375                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3376                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3377                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3378                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3379                 for (x = startx;x < endx;x++)
3380                 {
3381                         __m128i color, lightmap, glow, pix;
3382                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3383                         {
3384                                 __m128i pix2;
3385                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3386                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3387                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3388                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3389                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3390                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3391                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3392                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3393                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3394                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3395                                 x += 3;
3396                                 continue;
3397                         }
3398                         if (!pixelmask[x])
3399                                 continue;
3400                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3401                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3402                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3403                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3404                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3405                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3406                 }
3407         }
3408         else
3409         {
3410                 for (x = startx;x < endx;x++)
3411                 {
3412                         __m128i color, lightmap, pix;
3413                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3414                         {
3415                                 __m128i pix2;
3416                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3417                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3418                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3419                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3420                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3421                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3422                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3423                                 x += 3;
3424                                 continue;
3425                         }
3426                         if (!pixelmask[x]) 
3427                                 continue;
3428                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3429                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3430                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3431                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3432                 }
3433         }
3434         if (pixel == buffer_FragColorbgra8)
3435                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3436 #endif
3437 }
3438
3439
3440
3441 void DPSOFTRAST_VertexShader_FakeLight(void)
3442 {
3443         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3444 }
3445
3446 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3447 {
3448         // TODO: IMPLEMENT
3449         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3450         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3451         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3452         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3453         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3454 }
3455
3456
3457
// vertex stage for modelspace deluxemapping
// no dedicated implementation exists, this simply forwards to the plain
// lightmap vertex shader
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3462
3463 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3464 {
3465         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3466         // TODO: IMPLEMENT
3467 }
3468
3469
3470
// vertex stage for tangentspace deluxemapping
// no dedicated implementation exists, this simply forwards to the plain
// lightmap vertex shader
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3475
3476 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3477 {
3478         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3479         // TODO: IMPLEMENT
3480 }
3481
3482
3483
3484 void DPSOFTRAST_VertexShader_LightDirection(void)
3485 {
3486         int i;
3487         int numvertices = dpsoftrast.numvertices;
3488         float LightDir[4];
3489         float LightVector[4];
3490         float EyePosition[4];
3491         float EyeVectorModelSpace[4];
3492         float EyeVector[4];
3493         float position[4];
3494         float svector[4];
3495         float tvector[4];
3496         float normal[4];
3497         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3498         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3499         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3500         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3501         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3502         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3503         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3504         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3505         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3506         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3507         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3508         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3509         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3510         for (i = 0;i < numvertices;i++)
3511         {
3512                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3513                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3514                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3515                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3516                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3517                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3518                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3519                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3520                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3521                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3522                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3523                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3524                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3525                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3526                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3527                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3528                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3529                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3530                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3531                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3532                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3533                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3534                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3535                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3536                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3537                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3538                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3539                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3540                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3541         }
3542         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3543 }
3544
3545 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3546 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3547 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3548 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3549 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3550 #define DPSOFTRAST_Vector3Normalize(v)\
3551 do\
3552 {\
3553         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3554         if (len)\
3555         {\
3556                 len = 1.0f / len;\
3557                 v[0] *= len;\
3558                 v[1] *= len;\
3559                 v[2] *= len;\
3560         }\
3561 }\
3562 while(0)
3563
3564 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3565 {
3566         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3567         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3568         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3569         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3570         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3572         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3573         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3574         int x, startx = span->startx, endx = span->endx;
3575         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3576         float LightVectordata[4];
3577         float LightVectorslope[4];
3578         float EyeVectordata[4];
3579         float EyeVectorslope[4];
3580         float z;
3581         float diffusetex[4];
3582         float glosstex[4];
3583         float surfacenormal[4];
3584         float lightnormal[4];
3585         float eyenormal[4];
3586         float specularnormal[4];
3587         float diffuse;
3588         float specular;
3589         float SpecularPower;
3590         int d[4];
3591         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3592         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3593         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3594         Color_Glow[3] = 0.0f;
3595         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3596         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3597         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3598         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3599         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3600         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3601         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3602         Color_Pants[3] = 0.0f;
3603         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3604         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3605         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3606         Color_Shirt[3] = 0.0f;
3607         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3608         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3609         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3610         {
3611                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3612                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3613         }
3614         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3615         {
3616                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3617         }
3618         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3619         {
3620                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3621                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3622                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3623                 Color_Diffuse[3] = 0.0f;
3624                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3625                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3626                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3627                 LightColor[3] = 0.0f;
3628                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3629                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3630                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3631                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3632                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3633                 Color_Specular[3] = 0.0f;
3634                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3635                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3636                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3637                 for (x = startx;x < endx;x++)
3638                 {
3639                         z = buffer_z[x];
3640                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3641                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3642                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3643                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3644                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3645                         {
3646                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3647                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3648                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3649                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3650                         }
3651                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3652                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3653                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3654                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3655                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3656                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3657                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3658                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3659
3660                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3661                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3662                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3663                         DPSOFTRAST_Vector3Normalize(lightnormal);
3664
3665                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3666                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3667                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3668                         DPSOFTRAST_Vector3Normalize(eyenormal);
3669
3670                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3671                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3672                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3673                         DPSOFTRAST_Vector3Normalize(specularnormal);
3674
3675                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3676                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3677                         specular = pow(specular, SpecularPower * glosstex[3]);
3678                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3679                         {
3680                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3681                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3682                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3683                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3684                         }
3685                         else
3686                         {
3687                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3688                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3689                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3690                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3691                         }
3692                         buffer_FragColorbgra8[x*4+0] = d[0];
3693                         buffer_FragColorbgra8[x*4+1] = d[1];
3694                         buffer_FragColorbgra8[x*4+2] = d[2];
3695                         buffer_FragColorbgra8[x*4+3] = d[3];
3696                 }
3697         }
3698         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3699         {
3700                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3701                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3702                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3703                 Color_Diffuse[3] = 0.0f;
3704                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3705                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3706                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3707                 LightColor[3] = 0.0f;
3708                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3709                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3710                 for (x = startx;x < endx;x++)
3711                 {
3712                         z = buffer_z[x];
3713                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3714                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3715                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3716                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3717                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3718                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3719                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3720                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3721
3722                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3723                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3724                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3725                         DPSOFTRAST_Vector3Normalize(lightnormal);
3726
3727                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3728                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3729                         {
3730                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3731                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3732                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3733                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3734                         }
3735                         else
3736                         {
3737                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3738                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3739                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3740                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3741                         }
3742                         buffer_FragColorbgra8[x*4+0] = d[0];
3743                         buffer_FragColorbgra8[x*4+1] = d[1];
3744                         buffer_FragColorbgra8[x*4+2] = d[2];
3745                         buffer_FragColorbgra8[x*4+3] = d[3];
3746                 }
3747         }
3748         else
3749         {
3750                 for (x = startx;x < endx;x++)
3751                 {
3752                         z = buffer_z[x];
3753                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3754                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3755                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3756                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3757
3758                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3759                         {
3760                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3761                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3762                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3763                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3764                         }
3765                         else
3766                         {
3767                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3768                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3769                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3770                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3771                         }
3772                         buffer_FragColorbgra8[x*4+0] = d[0];
3773                         buffer_FragColorbgra8[x*4+1] = d[1];
3774                         buffer_FragColorbgra8[x*4+2] = d[2];
3775                         buffer_FragColorbgra8[x*4+3] = d[3];
3776                 }
3777         }
3778         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3779 }
3780
3781
3782
3783 void DPSOFTRAST_VertexShader_LightSource(void)
3784 {
3785         int i;
3786         int numvertices = dpsoftrast.numvertices;
3787         float LightPosition[4];
3788         float LightVector[4];
3789         float LightVectorModelSpace[4];
3790         float EyePosition[4];
3791         float EyeVectorModelSpace[4];
3792         float EyeVector[4];
3793         float position[4];
3794         float svector[4];
3795         float tvector[4];
3796         float normal[4];
3797         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3798         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3799         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3800         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3801         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3802         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3803         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3804         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3805         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3806         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3807         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3808         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3809         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3810         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3811         for (i = 0;i < numvertices;i++)
3812         {
3813                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3814                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3815                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3816                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3817                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3818                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3819                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3820                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3821                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3822                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3823                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3824                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3825                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3826                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3827                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3828                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3829                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3830                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3831                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3832                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3833                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3834                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3835                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3836                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3837                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3838                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3839                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3840                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3841                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3842                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3843                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3844                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3845         }
3846         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3847         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3848 }
3849
// Span (pixel) stage for the light-source shader mode.
// For every pixel of the span it computes a light attenuation from the
// interpolated TEXCOORD3 vector, optionally multiplies in a shadowmap sample,
// and then shades the colour texture along one of three paths selected by
// thread->shader_permutation: SPECULAR, DIFFUSE, or (otherwise) ambient only.
// The resulting BGRA bytes are passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The whole body is compiled only when SSE2_PRESENT is defined; without it
// this function is empty.
3850 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3851 {
3852 #ifdef SSE2_PRESENT
3853         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3854         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3855         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3856         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3857         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3858         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3859         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3860         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3861         int x, startx = span->startx, endx = span->endx;
3862         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3863         float CubeVectordata[4];
3864         float CubeVectorslope[4];
3865         float LightVectordata[4];
3866         float LightVectorslope[4];
3867         float EyeVectordata[4];
3868         float EyeVectorslope[4];
3869         float z;
3870         float diffusetex[4];
3871         float glosstex[4];
3872         float surfacenormal[4];
3873         float lightnormal[4];
3874         float eyenormal[4];
3875         float specularnormal[4];
3876         float diffuse;
3877         float specular;
3878         float SpecularPower;
3879         float CubeVector[4];
3880         float attenuation;
3881         int d[4];
             // Load the colour uniforms with components 0 and 2 swapped (uniform
             // component 0 lands in index 2) so that index N of each colour lines up
             // with byte N of the ...bgra8 texel buffers it is multiplied with below.
             // NOTE(review): Color_Glow is loaded here but never read in this function.
3882         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3883         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3884         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3885         Color_Glow[3] = 0.0f;
3886         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3887         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3888         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3889         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3890         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3891         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3892         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3893         Color_Diffuse[3] = 0.0f;
3894         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3895         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3896         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3897         Color_Specular[3] = 0.0f;
             // The fourth component of the pants/shirt colours is 0, so the colormapping
             // additions further down leave diffusetex[3] (alpha) unchanged.
3898         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3899         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3900         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3901         Color_Pants[3] = 0.0f;
3902         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3903         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3904         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3905         Color_Shirt[3] = 0.0f;
3906         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3907         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3908         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3909         LightColor[3] = 0.0f;
             // SpecularPower is pre-scaled by 1/255 because it is later multiplied by
             // glosstex[3], which is a raw texel byte converted to float.
3910         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
             // Set up data/slope pairs for TEXCOORD1..3; the loops below evaluate each
             // of them per pixel as (data + slope*x) * z.
3911         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3912         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3913         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3914         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3915         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3916         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
             // Pants/shirt and cube filter texels are fetched only when the matching
             // permutation bit is set; the loops test the same bits before reading them.
3917         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3918         {
3919                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3920                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3921         }
3922         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3923                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
             // Path 1: ambient + diffuse + specular (uses the normal and gloss textures).
3924         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3925         {
3926                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3927                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3928                 for (x = startx;x < endx;x++)
3929                 {
3930                         z = buffer_z[x];
3931                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3932                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3933                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
                             // Attenuation is 1 minus the value of DPSOFTRAST_Vector3LengthSquared(CubeVector).
                             // Pixels below the 0.01 threshold are skipped and keep the zero bytes
                             // written by the memset above.
3934                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3935                         if (attenuation < 0.01f)
3936                                 continue;
3937                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3938                         {
3939                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3940                                 if (attenuation < 0.01f)
3941                                         continue;
3942                         }
3943
3944                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3945                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3946                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3947                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3948                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3949                         {
3950                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3951                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3952                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3953                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3954                         }
3955                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3956                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3957                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3958                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
                             // Normal map bytes are read in 2,1,0 order and remapped with v/128 - 1
                             // before normalizing.
3959                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3960                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3961                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3962                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3963
3964                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3965                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3966                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3967                         DPSOFTRAST_Vector3Normalize(lightnormal);
3968
3969                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3970                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3971                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3972                         DPSOFTRAST_Vector3Normalize(eyenormal);
3973
                             // specularnormal is the normalized sum of the light and eye directions.
3974                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3975                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3976                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3977                         DPSOFTRAST_Vector3Normalize(specularnormal);
3978
                             // Both dot products are clamped at zero before use.
3979                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3980                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3981                         specular = pow(specular, SpecularPower * glosstex[3]);
                             // Each channel is converted to int and clamped only at the top (255);
                             // alpha is taken directly from diffusetex[3].
3982                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3983                         {
3984                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3985                                 attenuation *= (1.0f / 255.0f);
3986                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3987                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3988                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3989                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3990                         }
3991                         else
3992                         {
3993                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3994                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3995                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3996                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3997                         }
3998                         buffer_FragColorbgra8[x*4+0] = d[0];
3999                         buffer_FragColorbgra8[x*4+1] = d[1];
4000                         buffer_FragColorbgra8[x*4+2] = d[2];
4001                         buffer_FragColorbgra8[x*4+3] = d[3];
4002                 }
4003         }
             // Path 2: ambient + diffuse (normal texture only, no gloss/eye vector work).
4004         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4005         {
4006                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4007                 for (x = startx;x < endx;x++)
4008                 {
4009                         z = buffer_z[x];
4010                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4011                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4012                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
                             // Same attenuation and shadowmap early-outs as in the specular path.
4013                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4014                         if (attenuation < 0.01f)
4015                                 continue;
4016                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4017                         {
4018                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4019                                 if (attenuation < 0.01f)
4020                                         continue;
4021                         }
4022
4023                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4024                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4025                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4026                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4027                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4028                         {
4029                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4030                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4031                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4032                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4033                         }
4034                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4035                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4036                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4037                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4038
4039                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4040                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4041                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4042                         DPSOFTRAST_Vector3Normalize(lightnormal);
4043
4044                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4045                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4046                         {
4047                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4048                                 attenuation *= (1.0f / 255.0f);
4049                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4050                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4051                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4052                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4053                         }
4054                         else
4055                         {
4056                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4057                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4058                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4059                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4060                         }
4061                         buffer_FragColorbgra8[x*4+0] = d[0];
4062                         buffer_FragColorbgra8[x*4+1] = d[1];
4063                         buffer_FragColorbgra8[x*4+2] = d[2];
4064                         buffer_FragColorbgra8[x*4+3] = d[3];
4065                 }
4066         }
             // Path 3: ambient only (no normal texture fetch, no per-pixel light vector).
4067         else
4068         {
4069                 for (x = startx;x < endx;x++)
4070                 {
4071                         z = buffer_z[x];
4072                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4073                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4074                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
                             // Same attenuation and shadowmap early-outs as in the other two paths.
4075                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4076                         if (attenuation < 0.01f)
4077                                 continue;
4078                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4079                         {
4080                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4081                                 if (attenuation < 0.01f)
4082                                         continue;
4083                         }
4084
4085                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4086                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4087                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4088                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4089                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4090                         {
4091                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4092                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4093                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4094                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4095                         }
4096                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4097                         {
4098                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4099                                 attenuation *= (1.0f / 255.0f);
4100                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4101                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4102                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4103                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4104                         }
4105                         else
4106                         {
4107                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4108                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4109                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4110                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4111                         }
4112                         buffer_FragColorbgra8[x*4+0] = d[0];
4113                         buffer_FragColorbgra8[x*4+1] = d[1];
4114                         buffer_FragColorbgra8[x*4+2] = d[2];
4115                         buffer_FragColorbgra8[x*4+3] = d[3];
4116                 }
4117         }
             // Hand the finished BGRA bytes for this span to the common finish routine.
4118         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4119 #endif
4120 }
4121
4122
4123
4124 void DPSOFTRAST_VertexShader_Refraction(void)
4125 {
4126         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4127 }
4128
4129 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4130 {
4131         // TODO: IMPLEMENT
4132         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4133         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4134         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4135         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4136         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4137 }
4138
4139
4140
4141 void DPSOFTRAST_VertexShader_Water(void)
4142 {
4143         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4144 }
4145
4146
4147 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4148 {
4149         // TODO: IMPLEMENT
4150         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4151         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4152         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4153         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4154         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4155 }
4156
4157
4158
4159 void DPSOFTRAST_VertexShader_ShowDepth(void)
4160 {
4161         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4162 }
4163
4164 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4165 {
4166         // TODO: IMPLEMENT
4167         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4168         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4169         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4170         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4171         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4172 }
4173
4174
4175
4176 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4177 {
4178         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4179 }
4180
4181 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4182 {
4183         // TODO: IMPLEMENT
4184         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4185         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4186         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4187         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4188         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4189 }
4190
4191
4192
4193 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4194 {
4195         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4196 }
4197
4198 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4199 {
4200         // TODO: IMPLEMENT
4201         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4202         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4203         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4204         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4205         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4206 }
4207
4208
4209
4210 typedef struct DPSOFTRAST_ShaderModeInfo_s
4211 {
4212         int lodarrayindex;
4213         void (*Vertex)(void);
4214         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4215         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4216         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4217 }
4218 DPSOFTRAST_ShaderModeInfo;
4219
4220 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4221 {
4222         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4223         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4224         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4225         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4226         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4227         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4228         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4229         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4230         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4231         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4232         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4233         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4234         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4235         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4236         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4237         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4238 };
4239
4240 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4241 {
4242         int i;
4243         int x;
4244         int startx;
4245         int endx;
4246 //      unsigned int c;
4247 //      unsigned int *colorpixel;
4248         unsigned int *depthpixel;
4249         float w;
4250         float wslope;
4251         int depth;
4252         int depthslope;
4253         unsigned int d;
4254         DPSOFTRAST_State_Triangle *triangle;
4255         DPSOFTRAST_State_Span *span;
4256         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4257         for (i = 0; i < thread->numspans; i++)
4258         {
4259                 span = &thread->spans[i];
4260                 triangle = &thread->triangles[span->triangle];
4261                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4262                 {
4263                         wslope = triangle->w[0];
4264                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4265                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4266                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4267                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4268                         startx = span->startx;
4269                         endx = span->endx;
4270                         switch(thread->fb_depthfunc)
4271                         {
4272                         default:
4273                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4274                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4275                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4276                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4277                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4278                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4279                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4280                         }
4281                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4282                         //for (x = startx;x < endx;x++)
4283                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4284                         // if there is no color buffer, skip pixel shader
4285                         while (startx < endx && !pixelmask[startx])
4286                                 startx++;
4287                         while (endx > startx && !pixelmask[endx-1])
4288                                 endx--;
4289                         if (startx >= endx)
4290                                 continue; // no pixels to fill
4291                         span->pixelmask = pixelmask;
4292                         span->startx = startx;
4293                         span->endx = endx;
4294                         // run pixel shader if appropriate
4295                         // do this before running depthmask code, to allow the pixelshader
4296                         // to clear pixelmask values for alpha testing
4297                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4298                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4299                         if (thread->depthmask)
4300                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4301                                         if (pixelmask[x])
4302                                                 depthpixel[x] = d;
4303                 }
4304                 else
4305                 {
4306                         // no depth testing means we're just dealing with color...
4307                         // if there is no color buffer, skip pixel shader
4308                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4309                         {
4310                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4311                                 span->pixelmask = pixelmask;
4312                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4313                         }
4314                 }
4315         }
4316         thread->numspans = 0;
4317 }
4318
// Draw command record (the DEFCOMMAND macro itself is defined elsewhere;
// presumably 22 is the command's opcode -- confirm against the macro).
// Field usage as seen in DPSOFTRAST_Interpret_Draw:
//   starty, endy   row range of the draw; compared against the interpreting
//                  thread's row bands to skip the command entirely
//   refcount       atomically decremented once per interpreting thread; the
//                  thread that reaches zero may free 'arrays'
//   clipped        non-zero: triangles are tested/clipped against the near plane
//   firstvertex    subtracted from every element index
//   numvertices    vertices per array; each array is numvertices*4 floats
//   numtriangles   number of triangles to process
//   arrays         vertex data: screen coordinates first, further arrays after
//   element3i/3s   optional index lists (element3s is checked first); when both
//                  are NULL the vertices are consumed sequentially
//   datasize       not read in DPSOFTRAST_Interpret_Draw
4319 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4320
// Executes one Draw command for the given thread state.
//
// For each triangle of the command this fetches the three vertex indices,
// optionally clips against the near plane (command->clipped), culls by facing,
// clamps the polygon to the scissor rectangle and to this thread's two row
// bands (miny1..maxy1 and miny2..maxy2), builds the interpolation planes for w
// and for every array listed by the current shader mode, picks a mip level per
// texture unit, and emits spans into thread->spans.  Spans are flushed through
// DPSOFTRAST_Draw_ProcessSpans whenever the span or triangle buffer fills up
// and once more before returning.
//
// command->refcount is decremented once on each exit path; the caller that
// brings it to zero frees command->arrays when commandsize does not exceed the
// aligned size of the bare command struct (presumably meaning the vertex data
// was allocated separately rather than stored inline -- confirm against
// DPSOFTRAST_Draw_AllocateDrawCommand).
//
// The entire body is compiled only when SSE2_PRESENT is defined; otherwise the
// function is empty.
4321 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4322 {
4323 #ifdef SSE2_PRESENT
4324         int cullface = thread->cullface;
4325         int minx, maxx, miny, maxy;
4326         int miny1, maxy1, miny2, maxy2;
4327         __m128i fbmin, fbmax;
4328         __m128 viewportcenter, viewportscale;
4329         int firstvertex = command->firstvertex;
4330         int numvertices = command->numvertices;
4331         int numtriangles = command->numtriangles;
4332         const int *element3i = command->element3i;
4333         const unsigned short *element3s = command->element3s;
4334         int clipped = command->clipped;
4335         int i;
4336         int j;
4337         int k;
4338         int y;
4339         int e[3];
4340         __m128i screeny;
4341         int starty, endy, bandy;
4342         int numpoints;
4343         int clipcase;
4344         float clipdist[4];
4345         __m128 triangleedge1, triangleedge2, trianglenormal;
4346         __m128 clipfrac[3];
4347         __m128 screen[4];
4348         DPSOFTRAST_State_Triangle *triangle;
4349         DPSOFTRAST_Texture *texture;
4350         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // scissor rows, and the thread's two row bands clamped to them
4351         miny = thread->fb_scissor[1];
4352         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4353         miny1 = bound(miny, thread->miny1, maxy);
4354         maxy1 = bound(miny, thread->maxy1, maxy);
4355         miny2 = bound(miny, thread->miny2, maxy);
4356         maxy2 = bound(miny, thread->maxy2, maxy);
             // the command's row range touches neither band: nothing to draw here,
             // but this thread's reference on the command must still be released
4357         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4358         {
4359                 if (!ATOMIC_DECREMENT(command->refcount))
4360                 {
4361                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4362                                 MM_FREE(command->arrays);
4363                 }
4364                 return;
4365         }
4366         minx = thread->fb_scissor[0];
4367         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // packed 16-bit (x, y) bounds repeated four times; fbmax is inclusive (-1).
             // rows run from the start of band 1 to the end of band 2
4368         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4369         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4370         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4371         viewportscale = _mm_load_ps(thread->fb_viewportscale);
             // screen[3] and clipfrac[] are only written by the clipping branches below;
             // give them defined values up front
4372         screen[3] = _mm_setzero_ps();
4373         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4374         for (i = 0;i < numtriangles;i++)
4375         {
                     // command->arrays starts with numvertices screen coordinates
                     // (4 floats each); 'arrays' points at the array that follows them and
                     // is advanced one array at a time further down
4376                 const float *screencoord4f = command->arrays;
4377                 const float *arrays = screencoord4f + numvertices*4;
4378
4379                 // generate the 3 edges of this triangle
4380                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // vertex indices relative to firstvertex; 16-bit indices take
                     // precedence, and with no index list vertices are used in order
4381                 if (element3s)
4382                 {
4383                         e[0] = element3s[i*3+0] - firstvertex;
4384                         e[1] = element3s[i*3+1] - firstvertex;
4385                         e[2] = element3s[i*3+2] - firstvertex;
4386                 }
4387                 else if (element3i)
4388                 {
4389                         e[0] = element3i[i*3+0] - firstvertex;
4390                         e[1] = element3i[i*3+1] - firstvertex;
4391                         e[2] = element3i[i*3+2] - firstvertex;
4392                 }
4393                 else
4394                 {
4395                         e[0] = i*3+0;
4396                         e[1] = i*3+1;
4397                         e[2] = i*3+2;
4398                 }
4399
                     // SKIPBACKFACE: builds the two screen-space edges around screen[1] and
                     // their 2D cross product (edge1.x*edge2.y - edge1.y*edge2.x) in the low
                     // lane of trianglenormal, then 'continue's to the next triangle when the
                     // sign says the face is culled.  Because it contains 'continue' it must
                     // be expanded directly inside the triangle loop.
4400 #define SKIPBACKFACE \
4401                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4402                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4403                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4404                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4405                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4406                 switch(cullface) \
4407                 { \
4408                 case GL_BACK: \
4409                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4410                                 continue; \
4411                         break; \
4412                 case GL_FRONT: \
4413                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4414                                 continue; \
4415                         break; \
4416                 }
4417
                     // CLIPPEDVERTEXLERP: screen[k] = projection of the point where edge
                     // p1->p2 crosses the near plane; the interpolation fraction is kept in
                     // clipfrac[p1] so GENATTRIBLERP can reuse it for the attributes.
4418 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4419                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4420                         { \
4421                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4422                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4423                         }
                     // CLIPPEDVERTEXCOPY: screen[k] = already-projected coordinate of vertex p1.
4424 #define CLIPPEDVERTEXCOPY(k,p1) \
4425                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4426
                     // GENATTRIBS: produces the attribute values at screen[0..2] for the
                     // current 'arrays' pointer, mirroring per clipcase the copy/lerp choices
                     // made for the first three polygon points by the clipping code below.
4427 #define GENATTRIBCOPY(attrib, p1) \
4428                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4429 #define GENATTRIBLERP(attrib, p1, p2) \
4430                 { \
4431                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4432                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4433                 }
4434 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4435                 switch(clipcase) \
4436                 { \
4437                 default: \
4438                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4439                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4440                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4441                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4442                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4443                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4444                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4445                 }
4446
                     // draws flagged as not clipped skip the distance tests and jump
                     // straight into the "entirely in front" branch below
4447                 if (! clipped)
4448                         goto notclipped;
4449
4450                 // calculate distance from nearplane
                     // (z + w of the array that follows the screen coordinates; presumably
                     // the clip-space position -- confirm against the vertex shaders)
4451                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4452                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4453                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // each branch fills screen[] with the clipped polygon (3 or 4 points),
                     // runs the facing test, and records numpoints and clipcase
4454                 if (clipdist[0] >= 0.0f)
4455                 {
4456                         if (clipdist[1] >= 0.0f)
4457                         {
4458                                 if (clipdist[2] >= 0.0f)
4459                                 {
4460                                 notclipped:
4461                                         // triangle is entirely in front of nearplane
4462                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4463                                         SKIPBACKFACE;
4464                                         numpoints = 3;
4465                                         clipcase = 0;
4466                                 }
4467                                 else
4468                                 {
4469                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4470                                         SKIPBACKFACE;
4471                                         numpoints = 4;
4472                                         clipcase = 1;
4473                                 }
4474                         }
4475                         else
4476                         {
4477                                 if (clipdist[2] >= 0.0f)
4478                                 {
4479                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4480                                         SKIPBACKFACE;
4481                                         numpoints = 4;
4482                                         clipcase = 2;
4483                                 }
4484                                 else
4485                                 {
4486                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4487                                         SKIPBACKFACE;
4488                                         numpoints = 3;
4489                                         clipcase = 3;
4490                                 }
4491                         }
4492                 }
4493                 else if (clipdist[1] >= 0.0f)
4494                 {
4495                         if (clipdist[2] >= 0.0f)
4496                         {
4497                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4498                                 SKIPBACKFACE;
4499                                 numpoints = 4;
4500                                 clipcase = 4;
4501                         }
4502                         else
4503                         {
4504                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4505                                 SKIPBACKFACE;
4506                                 numpoints = 3;
4507                                 clipcase = 5;
4508                         }
4509                 }
4510                 else if (clipdist[2] >= 0.0f)
4511                 {
4512                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4513                         SKIPBACKFACE;
4514                         numpoints = 3;
4515                         clipcase = 6;
4516                 }
4517                 else continue; // triangle is entirely behind nearplane
4518
4519                 {
4520                         // calculate integer y coords for triangle points
                             // screeni packs the truncated (x, y) of the four points as eight
                             // 16-bit lanes (point 2 is duplicated for 3-point polygons); the
                             // shuffles reduce to min/max over all points in lanes 0 (x) and 1 (y)
4521                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4522                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4523                                         screenmin = _mm_min_epi16(screeni, screenir),
4524                                         screenmax = _mm_max_epi16(screeni, screenir);
4525                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4526                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
                             // clamp the bounding box to the scissor/band bounds
4527                         screenmin = _mm_max_epi16(screenmin, fbmin);
4528                         screenmax = _mm_min_epi16(screenmax, fbmax);
4529                         // skip offscreen triangles
4530                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4531                                 continue;
                             // row range of the clamped box; endy is exclusive
4532                         starty = _mm_extract_epi16(screenmin, 1);
4533                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // rows fall entirely in the gap between the two bands
4534                         if (starty >= maxy1 && endy <= miny2)
4535                                 continue;
                             // keep each point's integer y in its own 32-bit lane for the
                             // scanline loop below
4536                         screeny = _mm_srai_epi32(screeni, 16);
4537                 }
4538
                     // next free triangle record; it is only committed (numtriangles
                     // incremented) after the spans for it have been generated
4539                 triangle = &thread->triangles[thread->numtriangles];
4540
4541                 // calculate attribute plans for triangle data...
4542                 // okay, this triangle is going to produce spans, we'd better project
4543                 // the interpolants now (this is what gives perspective texturing),
4544                 // this consists of simply multiplying all arrays by the W coord
4545                 // (which is basically 1/Z), which will be undone per-pixel
4546                 // (multiplying by Z again) to get the perspective-correct array
4547                 // values
4548                 {
4549                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4550                         __m128 mipedgescale, mipdensity;
                             // edge components divided by the cross product computed in
                             // SKIPBACKFACE, broadcast into one register each
4551                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4552                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4553                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4554                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4555                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4556                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4557                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4558                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself: w[0] = change per x, w[1] = change per y,
                             // w[2] = value extrapolated to screen position (0, 0)
4559                         attribedge1 = _mm_sub_ss(w0, w1);
4560                         attribedge2 = _mm_sub_ss(w2, w1);
4561                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4562                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4563                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4564                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4565                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4566                         _mm_store_ss(&triangle->w[0], attribxslope);
4567                         _mm_store_ss(&triangle->w[1], attribyslope);
4568                         _mm_store_ss(&triangle->w[2], attriborigin);
4569                         mipedgescale = _mm_setzero_ps();
                             // same plane setup for every array the shader mode lists; the
                             // list ends at the first entry that is not a valid array index
4570                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4571                         {
4572                                 __m128 attrib0, attrib1, attrib2;
4573                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4574                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4575                                         break;
                                     // arrays are stored back to back in list order
4576                                 arrays += numvertices*4;
4577                                 GENATTRIBS(attrib0, attrib1, attrib2);
                                     // pre-multiply by w, then build x slope, y slope and origin
4578                                 attriborigin = _mm_mul_ps(attrib1, w1);
4579                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4580                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4581                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4582                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4583                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4584                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4585                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4586                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
                                     // for the shader mode's LOD array: change of the first two
                                     // attribute components along each edge, divided by that
                                     // edge's screen-space length (rsqrt is an approximation)
4587                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4588                                 {
4589                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4590                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4591                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4592                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4593                                 }
4594                         }
4595
                             // mip level per texture unit used by the shader mode; units not
                             // handled below keep level 0
4596                         memset(triangle->mip, 0, sizeof(triangle->mip));
4597                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4598                         {
4599                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4600                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4601                                         break;
4602                                 texture = thread->texbound[texunit];
                                     // only filters above LINEAR select a mip level (presumably
                                     // the mipmapping filter modes)
4603                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4604                                 {
                                             // scale by two ints read from mipmap[0][2..3]
                                             // (presumably the base level's width and height),
                                             // square, sum per edge, keep the smaller edge value
4605                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4606                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4607                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4608                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4609                                         // this will be multiplied in the texturing routine by the texture resolution
4610                                         y = _mm_cvtss_si32(mipdensity);
4611                                         if (y > 0)
4612                                         {
                                                     // y is a squared density, so 0.5*log2(y) is the
                                                     // level; clamp to the last available mipmap
4613                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4614                                                 if (y > texture->mipmaps - 1)
4615                                                         y = texture->mipmaps - 1;
4616                                                 triangle->mip[texunit] = y;
4617                                         }
4618                                 }
4619                         }
4620                 }
4621         
                     // scanline walk: the outer loop visits band 1 and then band 2 (jumping y
                     // to miny2), the inner loop advances y through the current band one
                     // edge-pair segment at a time
4622                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4623                 for (; y < bandy;)
4624                 {
4625                         __m128 xcoords, xslope;
                             // per point: all ones where y is greater than the point's integer y;
                             // movemask yields 4 bits per point.  In the case labels below a '1'
                             // in the bit-pattern comment marks a point with y <= its integer y.
4626                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4627                         int yccmask = _mm_movemask_epi8(ycc);
                             // edgeNp -> edgeNn are the start and end point indices of the two
                             // polygon edges that cross the current row
4628                         int edge0p, edge0n, edge1p, edge1n;
4629                         int nexty;
4630                         if (numpoints == 4)
4631                         {
4632                                 switch(yccmask)
4633                                 {
4634                                 default:
                                     // 0xFFFF: y is past every point, finished with this polygon
4635                                 case 0xFFFF: /*0000*/ y = endy; continue;
4636                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4637                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4638                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4639                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4640                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4641                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4642                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4643                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4644                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4645                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4646                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4647                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4648                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4649                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
                                     // 0x0000: y has not passed any point yet, try the next row
4650                                 case 0x0000: /*1111*/ y++; continue;
4651                                 }
4652                         }
4653                         else
4654                         {
                                     // 3-point polygon: lane 3 duplicates point 2, so its nibble
                                     // always matches point 2's
4655                                 switch(yccmask)
4656                                 {
4657                                 default:
4658                                 case 0xFFFF: /*000*/ y = endy; continue;
4659                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4660                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4661                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4662                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4663                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4664                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4665                                 case 0x0000: /*111*/ y++; continue;
4666                                 }
4667                         }
                             // nexty = smallest integer y among the points not yet passed
                             // (passed points are replaced by 0x7FFF before the min reduction),
                             // i.e. the last row this edge pair stays valid; limited to the band
4668                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4669                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4670                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4671                         nexty = _mm_extract_epi16(ycc, 0);
4672                         if (nexty >= bandy) nexty = bandy-1;
                             // dx/dy of both edges (lanes 0 and 2), then the x where each edge
                             // crosses row y, biased by 0.5 for the later float->int conversion
4673                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4674                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4675                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4676                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4677                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // make lane 0 the left edge and lane 2 the right edge
4678                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4679                         {
4680                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4681                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4682                         }
                             // one row per iteration, stepping both edge x values by their slopes
4683                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4684                         {
4685                                 int startx, endx, offset;
4686                                 startx = _mm_cvtss_si32(xcoords);
4687                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // left of the scissor: clamp to 0, then advance toward minx in
                                     // whole multiples of the span length; the remainder is
                                     // handled per span through span->startx
4688                                 if (startx < minx) 
4689                                 {
4690                                         if (startx < 0) startx = 0;
4691                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4692                                 }
4693                                 if (endx > maxx) endx = maxx;
4694                                 if (startx >= endx) continue;
                                     // cut the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH
                                     // pixels; span->startx/endx are offsets relative to span->x
4695                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4696                                 {
4697                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4698                                         span->triangle = thread->numtriangles;
4699                                         span->x = offset;
4700                                         span->y = y;
4701                                         span->startx = max(minx - offset, 0);
4702                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
                                             // empty span: leave numspans alone so the slot is reused
4703                                         if (span->startx >= span->endx)
4704                                                 continue; 
                                             // commit the span; flush when the span buffer is full
4705                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4706                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4707                                 }
4708                         }
4709                 }
4710
                     // commit the triangle record; when the triangle buffer is full, flush the
                     // spans that reference it before the records are reused
4711                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4712                 {
4713                         DPSOFTRAST_Draw_ProcessSpans(thread);
4714                         thread->numtriangles = 0;
4715                 }
4716         }
4717
             // done reading the vertex data: release this thread's reference (same
             // release logic as the early-out at the top of the function)
4718         if (!ATOMIC_DECREMENT(command->refcount))
4719         {
4720                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4721                         MM_FREE(command->arrays);
4722         }
4723
             // flush whatever is still queued so the span and triangle buffers are
             // empty when this function returns
4724         if (thread->numspans > 0 || thread->numtriangles > 0)
4725         {
4726                 DPSOFTRAST_Draw_ProcessSpans(thread);
4727                 thread->numtriangles = 0;
4728         }
4729 #endif
4730 }
4731
4732 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4733 {
4734         int i;
4735         int j;
4736         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4737         int datasize = 2*numvertices*sizeof(float[4]);
4738         DPSOFTRAST_Command_Draw *command;
4739         unsigned char *data;
4740         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4741         {
4742                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4743                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4744                         break;
4745                 datasize += numvertices*sizeof(float[4]);
4746         }
4747         if (element3s)
4748                 datasize += numtriangles*sizeof(unsigned short[3]);
4749         else if (element3i)
4750                 datasize += numtriangles*sizeof(int[3]);
4751         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4752         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4753         {
4754                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4755                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4756         }
4757         else
4758         {
4759                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4760                 data = (unsigned char *)command + commandsize;
4761         }
4762         command->firstvertex = firstvertex;
4763         command->numvertices = numvertices;
4764         command->numtriangles = numtriangles;
4765         command->arrays = (float *)data;
4766         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4767         dpsoftrast.firstvertex = firstvertex;
4768         dpsoftrast.numvertices = numvertices;
4769         dpsoftrast.screencoord4f = (float *)data;
4770         data += numvertices*sizeof(float[4]);
4771         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4772         data += numvertices*sizeof(float[4]);
4773         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4774         {
4775                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4776                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4777                         break;
4778                 dpsoftrast.post_array4f[j] = (float *)data;
4779                 data += numvertices*sizeof(float[4]);
4780         }
4781         command->element3i = NULL;
4782         command->element3s = NULL;
4783         if (element3s)
4784         {
4785                 command->element3s = (unsigned short *)data;
4786                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4787         }
4788         else if (element3i)
4789         {
4790                 command->element3i = (int *)data;
4791                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4792         }
4793         return command;
4794 }
4795
4796 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4797 {
4798         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4799         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4800         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4801         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4802         if (command->starty >= command->endy)
4803         {
4804                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4805                         MM_FREE(command->arrays);
4806                 DPSOFTRAST_UndoCommand(command->commandsize);
4807                 return;
4808         }
4809         command->clipped = dpsoftrast.drawclipped;
4810         command->refcount = dpsoftrast.numthreads;
4811
4812         if (dpsoftrast.usethreads)
4813         {
4814                 int i;
4815                 DPSOFTRAST_Draw_SyncCommands();
4816                 for (i = 0; i < dpsoftrast.numthreads; i++)
4817                 {
4818                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4819                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4820                                 Thread_CondSignal(thread->drawcond);
4821                 }
4822         }
4823         else
4824         {
4825                 DPSOFTRAST_Draw_FlushThreads();
4826         }
4827 }
4828  
4829 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4830 {
4831         int commandoffset = thread->commandoffset;
4832         while (commandoffset != endoffset)
4833         {
4834                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4835                 switch (command->opcode)
4836                 {
4837 #define INTERPCOMMAND(name) \
4838                 case DPSOFTRAST_OPCODE_##name : \
4839                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4840                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4841                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4842                                 commandoffset = 0; \
4843                         break;
4844                 INTERPCOMMAND(Viewport)
4845                 INTERPCOMMAND(ClearColor)
4846                 INTERPCOMMAND(ClearDepth)
4847                 INTERPCOMMAND(ColorMask)
4848                 INTERPCOMMAND(DepthTest)
4849                 INTERPCOMMAND(ScissorTest)
4850                 INTERPCOMMAND(Scissor)
4851                 INTERPCOMMAND(BlendFunc)
4852                 INTERPCOMMAND(BlendSubtract)
4853                 INTERPCOMMAND(DepthMask)
4854                 INTERPCOMMAND(DepthFunc)
4855                 INTERPCOMMAND(DepthRange)
4856                 INTERPCOMMAND(PolygonOffset)
4857                 INTERPCOMMAND(CullFace)
4858                 INTERPCOMMAND(AlphaTest)
4859                 INTERPCOMMAND(AlphaFunc)
4860                 INTERPCOMMAND(SetTexture)
4861                 INTERPCOMMAND(SetShader)
4862                 INTERPCOMMAND(Uniform4f)
4863                 INTERPCOMMAND(UniformMatrix4f)
4864                 INTERPCOMMAND(Uniform1i)
4865
4866                 case DPSOFTRAST_OPCODE_Draw:
4867                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4868                         commandoffset += command->commandsize;
4869                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4870                                 commandoffset = 0;
4871                         thread->commandoffset = commandoffset;
4872                         break;
4873
4874                 case DPSOFTRAST_OPCODE_Reset:
4875                         commandoffset = 0;
4876                         break;
4877                 }
4878         }
4879         thread->commandoffset = commandoffset;
4880 }
4881
4882 static int DPSOFTRAST_Draw_Thread(void *data)
4883 {
4884         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4885         while(thread->index >= 0)
4886         {
4887                 if (thread->commandoffset != dpsoftrast.drawcommand)
4888                 {
4889                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
4890                 }
4891                 else 
4892                 {
4893                         Thread_LockMutex(thread->drawmutex);
4894                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
4895                         {
4896                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
4897                                 thread->starving = true;
4898                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
4899                                 thread->starving = false;
4900                         }
4901                         Thread_UnlockMutex(thread->drawmutex);
4902                 }
4903         }   
4904         return 0;
4905 }
4906
4907 static void DPSOFTRAST_Draw_FlushThreads(void)
4908 {
4909         DPSOFTRAST_State_Thread *thread;
4910         int i;
4911         DPSOFTRAST_Draw_SyncCommands();
4912         if (dpsoftrast.usethreads) 
4913         {
4914                 for (i = 0; i < dpsoftrast.numthreads; i++)
4915                 {
4916                         thread = &dpsoftrast.threads[i];
4917                         if (thread->commandoffset != dpsoftrast.drawcommand)
4918                         {
4919                                 Thread_LockMutex(thread->drawmutex);
4920                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4921                                         Thread_CondSignal(thread->drawcond);
4922                                 Thread_UnlockMutex(thread->drawmutex);
4923                         }
4924                 }
4925                 for (i = 0; i < dpsoftrast.numthreads; i++)
4926                 {
4927                         thread = &dpsoftrast.threads[i];
4928                         if (thread->commandoffset != dpsoftrast.drawcommand)
4929                         {
4930                                 Thread_LockMutex(thread->drawmutex);
4931                                 if (thread->commandoffset != dpsoftrast.drawcommand)
4932                                 {
4933                                         thread->waiting = true;
4934                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
4935                                         thread->waiting = false;
4936                                 }
4937                                 Thread_UnlockMutex(thread->drawmutex);
4938                         }
4939                 }
4940         }
4941         else
4942         {
4943                 for (i = 0; i < dpsoftrast.numthreads; i++)
4944                 {
4945                         thread = &dpsoftrast.threads[i];
4946                         if (thread->commandoffset != dpsoftrast.drawcommand)
4947                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4948                 }
4949         }
4950         dpsoftrast.commandpool.usedcommands = 0;
4951 }
4952
// Public entry point that executes all queued commands by delegating to
// DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
4957
// Public entry point with the same effect as DPSOFTRAST_Flush: all queued
// commands are executed before it returns.
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
4962
4963 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4964 {
4965         int i;
4966         union
4967         {
4968                 int i;
4969                 unsigned char b[4];
4970         }
4971         u;
4972         u.i = 1;
4973         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4974         dpsoftrast.bigendian = u.b[3];
4975         dpsoftrast.fb_width = width;
4976         dpsoftrast.fb_height = height;
4977         dpsoftrast.fb_depthpixels = depthpixels;
4978         dpsoftrast.fb_colorpixels[0] = colorpixels;
4979         dpsoftrast.fb_colorpixels[1] = NULL;
4980         dpsoftrast.fb_colorpixels[1] = NULL;
4981         dpsoftrast.fb_colorpixels[1] = NULL;
4982         dpsoftrast.viewport[0] = 0;
4983         dpsoftrast.viewport[1] = 0;
4984         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4985         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4986         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4987         dpsoftrast.texture_firstfree = 1;
4988         dpsoftrast.texture_end = 1;
4989         dpsoftrast.texture_max = 0;
4990         dpsoftrast.color[0] = 1;
4991         dpsoftrast.color[1] = 1;
4992         dpsoftrast.color[2] = 1;
4993         dpsoftrast.color[3] = 1;
4994         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
4995         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
4996         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
4997         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4998         for (i = 0; i < dpsoftrast.numthreads; i++)
4999         {
5000                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5001                 thread->index = i;
5002                 thread->cullface = GL_BACK;
5003                 thread->colormask[1] = 1;
5004                 thread->colormask[2] = 1;
5005                 thread->colormask[3] = 1;
5006                 thread->blendfunc[0] = GL_ONE;
5007                 thread->blendfunc[1] = GL_ZERO;
5008                 thread->depthmask = true;
5009                 thread->depthtest = true;
5010                 thread->depthfunc = GL_LEQUAL;
5011                 thread->scissortest = false;
5012                 thread->alphatest = false;
5013                 thread->alphafunc = GL_GREATER;
5014                 thread->alphavalue = 0.5f;
5015                 thread->viewport[0] = 0;
5016                 thread->viewport[1] = 0;
5017                 thread->viewport[2] = dpsoftrast.fb_width;
5018                 thread->viewport[3] = dpsoftrast.fb_height;
5019                 thread->scissor[0] = 0;
5020                 thread->scissor[1] = 0;
5021                 thread->scissor[2] = dpsoftrast.fb_width;
5022                 thread->scissor[3] = dpsoftrast.fb_height;
5023                 thread->depthrange[0] = 0;
5024                 thread->depthrange[1] = 1;
5025                 thread->polygonoffset[0] = 0;
5026                 thread->polygonoffset[1] = 0;
5027         
5028                 if (dpsoftrast.interlace)
5029                 {
5030                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5031                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5032                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5033                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5034                 }
5035                 else
5036                 {
5037                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5038                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5039                 }
5040
5041                 thread->numspans = 0;
5042                 thread->numtriangles = 0;
5043                 thread->commandoffset = 0;
5044                 thread->waiting = false;
5045                 thread->starving = false;
5046            
5047                 thread->validate = -1;
5048                 DPSOFTRAST_Validate(thread, -1);
5049  
5050                 if (dpsoftrast.usethreads)
5051                 {
5052                         thread->waitcond = Thread_CreateCond();
5053                         thread->drawcond = Thread_CreateCond();
5054                         thread->drawmutex = Thread_CreateMutex();
5055                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5056                 }
5057         }
5058         return 0;
5059 }
5060
5061 void DPSOFTRAST_Shutdown(void)
5062 {
5063         int i;
5064         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5065         {
5066                 DPSOFTRAST_State_Thread *thread;
5067                 for (i = 0; i < dpsoftrast.numthreads; i++)
5068                 {
5069                         thread = &dpsoftrast.threads[i];
5070                         Thread_LockMutex(thread->drawmutex);
5071                         thread->index = -1;
5072                         Thread_CondSignal(thread->drawcond);
5073                         Thread_UnlockMutex(thread->drawmutex);
5074                         Thread_WaitThread(thread->thread, 0);
5075                         Thread_DestroyCond(thread->waitcond);
5076                         Thread_DestroyCond(thread->drawcond);
5077                         Thread_DestroyMutex(thread->drawmutex);
5078                 }
5079         }
5080         for (i = 0;i < dpsoftrast.texture_end;i++)
5081                 if (dpsoftrast.texture[i].bytes)
5082                         MM_FREE(dpsoftrast.texture[i].bytes);
5083         if (dpsoftrast.texture)
5084                 free(dpsoftrast.texture);
5085         if (dpsoftrast.threads)
5086                 MM_FREE(dpsoftrast.threads);
5087         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5088 }
5089