]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
f87859823e92538b45c0a1c4ee67c210d5f057d5
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// plain C builds get a bool type aliased to qboolean (C++ already has bool)
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// byte alignments applied by the ALIGN() and ATOMIC() macros below
13 #define ALIGN_SIZE 16
14 #define ATOMIC_SIZE 32
15
// compiler-specific alignment and atomic helpers, only defined when SSE2_PRESENT is set;
// if neither compiler branch matches, the plain fallbacks that follow are used instead
16 #ifdef SSE2_PRESENT
17         #if defined(__GNUC__)
                // GCC flavour: alignment via __attribute__, counters via the __sync builtins
18                 #define ALIGN(var) var __attribute__((__aligned__(16)))
19                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
20                 #define MEMORY_BARRIER (_mm_sfence())
21                 //(__sync_synchronize())
22                 #define ATOMIC_COUNTER volatile int
23                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
24                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                // note: this flavour of ATOMIC_ADD yields void, the MSVC one yields the Interlocked result
25                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
26         #elif defined(_MSC_VER)
                // MSVC flavour: alignment via __declspec, counters via the Interlocked* functions
27                 #define ALIGN(var) __declspec(align(16)) var
28                 #define ATOMIC(var) __declspec(align(32)) var
29                 #define MEMORY_BARRIER (_mm_sfence())
30                 //(MemoryBarrier())
31                 #define ATOMIC_COUNTER volatile LONG
32                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
33                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
34                 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
35         #endif
36 #endif
37
// fallbacks for anything not defined above: no alignment request, no barrier,
// and ordinary (non-atomic) integer operations on a plain int counter
38 #ifndef ALIGN
39 #define ALIGN(var) var
40 #endif
41 #ifndef ATOMIC
42 #define ATOMIC(var) var
43 #endif
44 #ifndef MEMORY_BARRIER
45 #define MEMORY_BARRIER ((void)0)
46 #endif
47 #ifndef ATOMIC_COUNTER
48 #define ATOMIC_COUNTER int
49 #endif
50 #ifndef ATOMIC_INCREMENT
51 #define ATOMIC_INCREMENT(counter) (++(counter))
52 #endif
53 #ifndef ATOMIC_DECREMENT
54 #define ATOMIC_DECREMENT(counter) (--(counter))
55 #endif
56 #ifndef ATOMIC_ADD
57 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
58 #endif
59
// MM_MALLOC / MM_CALLOC / MM_FREE: allocation helpers used for pixel storage.
// With SSE2 they return memory aligned to ATOMIC_SIZE bytes (via _mm_malloc),
// otherwise they map straight onto malloc / calloc / free.
// memory obtained from MM_MALLOC or MM_CALLOC must be released with MM_FREE.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// aligned counterpart of calloc(): returns nmemb*size zeroed bytes, or NULL on
// failure (including when nmemb*size does not fit in size_t)
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc() refuses requests whose byte count wraps around; without this
	// check a wrapped product would silently allocate a too-small buffer
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
78
// indices of the per-vertex attribute arrays; used as the first index of
// DPSOFTRAST_State_Triangle::attribs and of DPSOFTRAST_State::post_array4f
79 typedef enum DPSOFTRAST_ARRAY_e
80 {
81         DPSOFTRAST_ARRAY_POSITION,
82         DPSOFTRAST_ARRAY_COLOR,
83         DPSOFTRAST_ARRAY_TEXCOORD0,
84         DPSOFTRAST_ARRAY_TEXCOORD1,
85         DPSOFTRAST_ARRAY_TEXCOORD2,
86         DPSOFTRAST_ARRAY_TEXCOORD3,
87         DPSOFTRAST_ARRAY_TEXCOORD4,
88         DPSOFTRAST_ARRAY_TEXCOORD5,
89         DPSOFTRAST_ARRAY_TEXCOORD6,
90         DPSOFTRAST_ARRAY_TEXCOORD7,
91         DPSOFTRAST_ARRAY_TOTAL // number of arrays, not an array itself
92 }
93 DPSOFTRAST_ARRAY;
94
// one slot of the dpsoftrast.texture array; a slot whose bytes pointer is NULL is unused
// (see DPSOFTRAST_Texture_GetByIndex)
95 typedef struct DPSOFTRAST_Texture_s
96 {
97         int flags; // DPSOFTRAST_TEXTURE_ flags/format as passed to DPSOFTRAST_Texture_New
98         int width;
99         int height;
100         int depth;
101         int sides; // 6 for cubemaps, otherwise 1
102         DPSOFTRAST_TEXTURE_FILTER filter; // set by DPSOFTRAST_Texture_Filter
103         int mipmaps; // number of entries of mipmap[] that are filled in
104         int size; // total size in bytes of all mip levels
105         ATOMIC_COUNTER binds; // while nonzero, the texture functions call DPSOFTRAST_Flush before touching the texture
106         unsigned char *bytes; // pixel storage for all mip levels (4 bytes per pixel), from MM_CALLOC
        // per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
107         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
108 }
109 DPSOFTRAST_Texture;
110
// commands are aligned the same way as ALIGN() data
111 #define COMMAND_SIZE ALIGN_SIZE
112 #define COMMAND_ALIGN(var) ALIGN(var)
113
// common header shared by every command struct declared with DEFCOMMAND
114 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
115 {
116         unsigned char opcode;
117         unsigned short commandsize;
118 }
119 DPSOFTRAST_Command);
120
121 enum { DPSOFTRAST_OPCODE_Reset = 0 };
122
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> = opcodeval
// and the struct DPSOFTRAST_Command_<name>, which repeats the DPSOFTRAST_Command header
// members and then appends the given fields
123 #define DEFCOMMAND(opcodeval, name, fields) \
124         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
125         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
126         { \
127                 unsigned char opcode; \
128                 unsigned short commandsize; \
129                 fields \
130         } DPSOFTRAST_Command_##name );
131
// size in bytes of the command buffer below
132 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest single command allowed in the pool - confirm against the allocator
133 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
134
// byte buffer holding queued commands plus its bookkeeping counters
135 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
136 {
137         int freecommand; // published to dpsoftrast.drawcommand by DPSOFTRAST_Draw_SyncCommands
138         int usedcommands; // compared against DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space by DPSOFTRAST_Draw_FreeCommandPool
139         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
140 }
141 DPSOFTRAST_State_Command_Pool);
142
// per-triangle data consumed by the DPSOFTRAST_CALCATTRIB macros
143 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
144 {
145         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
146         float w[3];
        // per attribute array, three 4-component vectors as used by DPSOFTRAST_CALCATTRIB:
        // [0] is multiplied by span x, [1] is multiplied by span y, [2] is the constant term
147         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
148 }
149 DPSOFTRAST_State_Triangle);
150
// DPSOFTRAST_CALCATTRIB: evaluate attribute array arrayindex of a triangle at a span's
// x,y position using SSE.  slope receives attribs[arrayindex][0] (the per-x step) and
// data receives attribs[arrayindex][2] + x*attribs[arrayindex][0] + y*attribs[arrayindex][1].
// data and slope must be __m128 lvalues; the attribs rows must be 16 byte aligned.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// DPSOFTRAST_CALCATTRIB4F: scalar equivalent of DPSOFTRAST_CALCATTRIB, data and slope are float[4].
// every pointer/array argument is parenthesized so that expressions such as "spans + i"
// can be passed, matching the SSE variant above.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
167                                         
168 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
169
// one horizontal run of pixels belonging to a triangle; x and y are the values the
// DPSOFTRAST_CALCATTRIB macros multiply the attribute steps by
170 typedef ALIGN(struct DPSOFTRAST_State_Span_s
171 {
172         int triangle; // triangle this span was generated by
173         int x; // framebuffer x coord
174         int y; // framebuffer y coord
175         int startx; // usable range (according to pixelmask)
176         int endx; // usable range (according to pixelmask)
177         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
178 }
179 DPSOFTRAST_State_Span);
180
// capacities of the spans[] and triangles[] arrays in DPSOFTRAST_State_Thread
181 #define DPSOFTRAST_DRAW_MAXSPANS 1024
182 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
183
// bits of DPSOFTRAST_State_Thread::validate; a set bit makes DPSOFTRAST_Validate
// recompute the corresponding derived values (RecalcFB / RecalcDepthFunc / RecalcBlendFunc)
184 #define DPSOFTRAST_VALIDATE_FB 1
185 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
186 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
187 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
188
// blend modes stored in DPSOFTRAST_State_Thread::fb_blendmode; DPSOFTRAST_RecalcBlendFunc
// picks one from the blendfunc factor pair and falls back to OPAQUE for unknown pairs
189 typedef enum DPSOFTRAST_BLENDMODE_e
190 {
191         DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO
192         DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
193         DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
194         DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
195         DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
196         DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
197         DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
198         DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE while blendsubtract is set
199         DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
200         DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
201         DPSOFTRAST_BLENDMODE_TOTAL
202 }
203 DPSOFTRAST_BLENDMODE;
204
// state kept per entry of dpsoftrast.threads: a copy of the render state, the values
// derived from it by DPSOFTRAST_Validate, and the span/triangle work arrays
205 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
206 {
207         void *thread;
208         int index;
209         
210         int cullface;
211         int colormask[4];
212         int blendfunc[2]; // [0] source factor, [1] destination factor (see DPSOFTRAST_RecalcBlendFunc)
213         int blendsubtract;
214         int depthmask;
215         int depthtest;
216         int depthfunc;
217         int scissortest;
218         int alphatest;
219         int alphafunc;
220         float alphavalue;
221         int viewport[4];
222         int scissor[4]; // x, y, width, height; y is flipped against fb_height in DPSOFTRAST_RecalcFB
223         float depthrange[2];
224         float polygonoffset[2];
225
226         int shader_mode;
227         int shader_permutation;
228
        // pointers into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow when that array moves
229         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
230         
231         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
232         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
233
234         // DPSOFTRAST_VALIDATE_ flags
235         int validate;
236
237         // derived values (DPSOFTRAST_VALIDATE_FB)
238         int fb_colormask;
239         int fb_scissor[4]; // x, y, width, height after clamping to the framebuffer
240         ALIGN(float fb_viewportcenter[4]);
241         ALIGN(float fb_viewportscale[4]);
242
243         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
244         int fb_depthfunc; // depthfunc, or GL_ALWAYS when depthtest is off
245
246         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
247         int fb_blendmode; // a DPSOFTRAST_BLENDMODE value
248
249         // band boundaries
250         int miny1;
251         int maxy1;
252         int miny2;
253         int maxy2;
254
255         ATOMIC(volatile int commandoffset);
256
257         volatile bool waiting;
258         volatile bool starving;
259         void *waitcond;
260         void *drawcond;
261         void *drawmutex;
262
263         int numspans;
264         int numtriangles;
265         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
266         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
267 }
268 DPSOFTRAST_State_Thread);
269
// global rasterizer state (the single instance is the dpsoftrast variable)
270 typedef ATOMIC(struct DPSOFTRAST_State_s
271 {
        // render targets, set by DPSOFTRAST_SetRenderTargets
272         int fb_width;
273         int fb_height;
274         unsigned int *fb_depthpixels;
275         unsigned int *fb_colorpixels[4];
276
277         int viewport[4];
278         ALIGN(float fb_viewportcenter[4]);
279         ALIGN(float fb_viewportscale[4]);
280
281         float color[4];
282         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
283         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
284
285         const float *pointer_vertex3f;
286         const float *pointer_color4f;
287         const unsigned char *pointer_color4ub;
288         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
289         int stride_vertex;
290         int stride_color;
291         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
292         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        // pointers into the texture array below; rebased by DPSOFTRAST_Texture_Grow
293         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
294
295         int firstvertex;
296         int numvertices;
297         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
298         float *screencoord4f;
299         int drawstarty;
300         int drawendy;
301         int drawclipped;
302         
303         int shader_mode;
304         int shader_permutation;
305
        // texture slot array, managed by DPSOFTRAST_Texture_New / _Free / _Grow
306         int texture_max; // allocated number of slots
307         int texture_end; // one past the highest slot in use
308         int texture_firstfree; // slot at which the search for a free slot starts
309         DPSOFTRAST_Texture *texture;
310
311         int bigendian;
312
313         // error reporting
314         const char *errorstring;
315
316         bool usethreads;
317         int interlace;
318         int numthreads;
319         DPSOFTRAST_State_Thread *threads;
320
        // set from commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
321         ATOMIC(volatile int drawcommand);
322
323         DPSOFTRAST_State_Command_Pool commandpool;
324 }
325 DPSOFTRAST_State);
326
327 DPSOFTRAST_State dpsoftrast;
328
// depth buffer values are (1 - depth) scaled by 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// pack four float channels (0..1) into a 32bit BGRA8 pixel with rounding; every argument is
// parenthesized so expressions can be passed, and the alpha byte is shifted as unsigned
// because 255<<24 does not fit in a signed int
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// convert a float depth (0 = near) to the integer depth buffer representation
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
334
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
336 {
337         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339         fb_viewportcenter[3] = 0.5f;
340         fb_viewportcenter[0] = 0.0f;
341         fb_viewportscale[1] = 0.5f * viewport[2];
342         fb_viewportscale[2] = -0.5f * viewport[3];
343         fb_viewportscale[3] = 0.5f;
344         fb_viewportscale[0] = 1.0f;
345 }
346
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
348 {
349         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350         // and viewport projection values
351         int x1, x2;
352         int y1, y2;
353         x1 = thread->scissor[0];
354         x2 = thread->scissor[0] + thread->scissor[2];
355         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356         y2 = dpsoftrast.fb_height - thread->scissor[1];
357         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
358         if (x1 < 0) x1 = 0;
359         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
360         if (y1 < 0) y1 = 0;
361         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
362         thread->fb_scissor[0] = x1;
363         thread->fb_scissor[1] = y1;
364         thread->fb_scissor[2] = x2 - x1;
365         thread->fb_scissor[3] = y2 - y1;
366
367         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
368 }
369
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
371 {
372         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
373 }
374
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
376 {
377         if (thread->blendsubtract)
378         {
379                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
380                 {
381                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
383                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
385                 }
386         }
387         else
388         {       
389                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390                 {
391                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
396                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
402                 }
403         }
404 }
405
406 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
407
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
409 {
410         mask &= thread->validate;
411         if (!mask)
412                 return;
413         if (mask & DPSOFTRAST_VALIDATE_FB)
414         {
415                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416                 DPSOFTRAST_RecalcFB(thread);
417         }
418         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
419         {
420                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421                 DPSOFTRAST_RecalcDepthFunc(thread);
422         }
423         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
424         {
425                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426                 DPSOFTRAST_RecalcBlendFunc(thread);
427         }
428 }
429
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
431 {
432         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433                 return &dpsoftrast.texture[index];
434         return NULL;
435 }
436
437 static void DPSOFTRAST_Texture_Grow(void)
438 {
439         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440         DPSOFTRAST_State_Thread *thread;
441         int i;
442         int j;
443         DPSOFTRAST_Flush();
444         // expand texture array as needed
445         if (dpsoftrast.texture_max < 1024)
446                 dpsoftrast.texture_max = 1024;
447         else
448                 dpsoftrast.texture_max *= 2;
449         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
450         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451                 if (dpsoftrast.texbound[i])
452                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
453         for (j = 0; j < dpsoftrast.numthreads; j++)
454         {
455                 thread = &dpsoftrast.threads[j];
456                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457                         if (thread->texbound[i])
458                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
459         }
460 }
461
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
463 {
464         int w;
465         int h;
466         int d;
467         int size;
468         int s;
469         int texnum;
470         int mipmaps;
471         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473         DPSOFTRAST_Texture *texture;
474         if (width*height*depth < 1)
475         {
476                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
477                 return 0;
478         }
479         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
480         {
481                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
482                 return 0;
483         }
484         switch(texformat)
485         {
486         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
489                 break;
490         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
492                 {
493                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
494                         return 0;
495                 }
496                 if (depth != 1)
497                 {
498                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
499                         return 0;
500                 }
501                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
502                 {
503                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
504                         return 0;
505                 }
506                 break;
507         }
508         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
509         {
510                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
511                 return 0;
512         }
513         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
514         {
515                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
516                 return 0;
517         }
518         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
519         {
520                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
521                 return 0;
522         }
523         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524         {
525                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
526                 return 0;
527         }
528         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
529         {
530                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
531                 return 0;
532         }
533         // find first empty slot in texture array
534         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535                 if (!dpsoftrast.texture[texnum].bytes)
536                         break;
537         dpsoftrast.texture_firstfree = texnum + 1;
538         if (dpsoftrast.texture_max <= texnum)
539                 DPSOFTRAST_Texture_Grow();
540         if (dpsoftrast.texture_end <= texnum)
541                 dpsoftrast.texture_end = texnum + 1;
542         texture = &dpsoftrast.texture[texnum];
543         memset(texture, 0, sizeof(*texture));
544         texture->flags = flags;
545         texture->width = width;
546         texture->height = height;
547         texture->depth = depth;
548         texture->sides = sides;
549         texture->binds = 0;
550         w = width;
551         h = height;
552         d = depth;
553         size = 0;
554         mipmaps = 0;
555         w = width;
556         h = height;
557         d = depth;
558         for (;;)
559         {
560                 s = w * h * d * sides * 4;
561                 texture->mipmap[mipmaps][0] = size;
562                 texture->mipmap[mipmaps][1] = s;
563                 texture->mipmap[mipmaps][2] = w;
564                 texture->mipmap[mipmaps][3] = h;
565                 texture->mipmap[mipmaps][4] = d;
566                 size += s;
567                 mipmaps++;
568                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569                         break;
570                 if (w > 1) w >>= 1;
571                 if (h > 1) h >>= 1;
572                 if (d > 1) d >>= 1;
573         }
574         texture->mipmaps = mipmaps;
575         texture->size = size;
576
577         // allocate the pixels now
578         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
579
580         return texnum;
581 }
582 void DPSOFTRAST_Texture_Free(int index)
583 {
584         DPSOFTRAST_Texture *texture;
585         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
586         if (texture->binds)
587                 DPSOFTRAST_Flush();
588         if (texture->bytes)
589                 MM_FREE(texture->bytes);
590         texture->bytes = NULL;
591         memset(texture, 0, sizeof(*texture));
592         // adjust the free range and used range
593         if (dpsoftrast.texture_firstfree > index)
594                 dpsoftrast.texture_firstfree = index;
595         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596                 dpsoftrast.texture_end--;
597 }
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
599 {
600         int i, x, y, z, w, layer0, layer1, row0, row1;
601         unsigned char *o, *i0, *i1, *i2, *i3;
602         DPSOFTRAST_Texture *texture;
603         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604         if (texture->mipmaps <= 1)
605                 return;
606         for (i = 1;i < texture->mipmaps;i++)
607         {
608                 for (z = 0;z < texture->mipmap[i][4];z++)
609                 {
610                         layer0 = z*2;
611                         layer1 = z*2+1;
612                         if (layer1 >= texture->mipmap[i-1][4])
613                                 layer1 = texture->mipmap[i-1][4]-1;
614                         for (y = 0;y < texture->mipmap[i][3];y++)
615                         {
616                                 row0 = y*2;
617                                 row1 = y*2+1;
618                                 if (row1 >= texture->mipmap[i-1][3])
619                                         row1 = texture->mipmap[i-1][3]-1;
620                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
621                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625                                 w = texture->mipmap[i][2];
626                                 if (layer1 > layer0)
627                                 {
628                                         if (texture->mipmap[i-1][2] > 1)
629                                         {
630                                                 // average 3D texture
631                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
632                                                 {
633                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
637                                                 }
638                                         }
639                                         else
640                                         {
641                                                 // average 3D mipmap with parent width == 1
642                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
643                                                 {
644                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
648                                                 }
649                                         }
650                                 }
651                                 else
652                                 {
653                                         if (texture->mipmap[i-1][2] > 1)
654                                         {
655                                                 // average 2D texture (common case)
656                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
657                                                 {
658                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
662                                                 }
663                                         }
664                                         else
665                                         {
666                                                 // 2D texture with parent width == 1
667                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
668                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
669                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
670                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
671                                         }
672                                 }
673                         }
674                 }
675         }
676 }
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
678 {
679         DPSOFTRAST_Texture *texture;
680         unsigned char *dst;
681         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
682         if (texture->binds)
683                 DPSOFTRAST_Flush();
684         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685         while (blockheight > 0)
686         {
687                 memcpy(dst, pixels, blockwidth * 4);
688                 pixels += blockwidth * 4;
689                 dst += texture->mipmap[0][2] * 4;
690                 blockheight--;
691         }
692         DPSOFTRAST_Texture_CalculateMipmaps(index);
693 }
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
695 {
696         DPSOFTRAST_Texture *texture;
697         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
698         if (texture->binds)
699                 DPSOFTRAST_Flush();
700         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707         return texture->mipmap[mip][2];
708 }
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
710 {
711         DPSOFTRAST_Texture *texture;
712         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713         return texture->mipmap[mip][3];
714 }
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
716 {
717         DPSOFTRAST_Texture *texture;
718         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719         return texture->mipmap[mip][4];
720 }
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
722 {
723         DPSOFTRAST_Texture *texture;
724         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725         if (texture->binds)
726                 DPSOFTRAST_Flush();
727         return texture->bytes + texture->mipmap[mip][0];
728 }
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
734         {
735                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
736                 return;
737         }
738         if (texture->binds)
739                 DPSOFTRAST_Flush();
740         texture->filter = filter;
741 }
742
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
744 {
745         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
748                 DPSOFTRAST_Flush();
749         dpsoftrast.fb_width = width;
750         dpsoftrast.fb_height = height;
751         dpsoftrast.fb_depthpixels = depthpixels;
752         dpsoftrast.fb_colorpixels[0] = colorpixels0;
753         dpsoftrast.fb_colorpixels[1] = colorpixels1;
754         dpsoftrast.fb_colorpixels[2] = colorpixels2;
755         dpsoftrast.fb_colorpixels[3] = colorpixels3;
756 }
757
758 static void DPSOFTRAST_Draw_FlushThreads(void);
759
760 static void DPSOFTRAST_Draw_SyncCommands(void)
761 {
762         if(dpsoftrast.usethreads) MEMORY_BARRIER;
763         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
764 }
765
766 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
767 {
768         DPSOFTRAST_State_Thread *thread;
769         int i;
770         int freecommand = dpsoftrast.commandpool.freecommand;
771         int usedcommands = dpsoftrast.commandpool.usedcommands;
772         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
773                 return;
774         DPSOFTRAST_Draw_SyncCommands();
775         for(;;)
776         {
777                 int waitindex = -1;
778                 int commandoffset;
779                 usedcommands = 0;
780                 for (i = 0; i < dpsoftrast.numthreads; i++)
781                 {
782                         thread = &dpsoftrast.threads[i]; 
783                         commandoffset = freecommand - thread->commandoffset;
784                         if (commandoffset < 0)
785                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
786                         if (commandoffset > usedcommands)
787                         {
788                                 waitindex = i;
789                                 usedcommands = commandoffset;
790                         }
791                 }
792                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
793                         break;
794                 thread = &dpsoftrast.threads[waitindex];
795                 Thread_LockMutex(thread->drawmutex);
796                 if (thread->commandoffset != dpsoftrast.drawcommand)
797                 {
798                         thread->waiting = true;
799                         if (thread->starving) Thread_CondSignal(thread->drawcond);
800                         Thread_CondWait(thread->waitcond, thread->drawmutex);
801                         thread->waiting = false;
802                 }
803                 Thread_UnlockMutex(thread->drawmutex);
804         }
805         dpsoftrast.commandpool.usedcommands = usedcommands;
806 }
807
// Rounds size up to the next multiple of COMMAND_SIZE.  The masking only works
// when COMMAND_SIZE is a power of two.  The argument is evaluated more than once.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) - (((size) + (COMMAND_SIZE-1)) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> through DPSOFTRAST_AllocateCommand, passing the
// aligned size of the command struct, and casts the result to the command type.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_##name))))
812
813 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
814 {
815         DPSOFTRAST_Command *command;
816         int freecommand = dpsoftrast.commandpool.freecommand;
817         int usedcommands = dpsoftrast.commandpool.usedcommands;
818         int extra = sizeof(DPSOFTRAST_Command);
819         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
820                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
821         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
822         {
823                 if (dpsoftrast.usethreads)
824                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
825                 else
826                         DPSOFTRAST_Draw_FlushThreads();
827                 freecommand = dpsoftrast.commandpool.freecommand;
828                 usedcommands = dpsoftrast.commandpool.usedcommands;
829         }
830         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831         {
832                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833                 command->opcode = DPSOFTRAST_OPCODE_Reset;
834                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
835                 freecommand = 0;
836         }
837         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838         command->opcode = opcode;
839         command->commandsize = size;
840         freecommand += size;
841         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
842                 freecommand = 0;
843         dpsoftrast.commandpool.freecommand = freecommand;
844         dpsoftrast.commandpool.usedcommands = usedcommands + size;
845         return command;
846 }
847
848 static void DPSOFTRAST_UndoCommand(int size)
849 {
850         int freecommand = dpsoftrast.commandpool.freecommand;
851         int usedcommands = dpsoftrast.commandpool.usedcommands;
852         freecommand -= size;
853         if (freecommand < 0)
854                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855         usedcommands -= size;
856         dpsoftrast.commandpool.freecommand = freecommand;
857         dpsoftrast.commandpool.usedcommands = usedcommands;
858 }
859                 
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
862 {
863         thread->viewport[0] = command->x;
864         thread->viewport[1] = command->y;
865         thread->viewport[2] = command->width;
866         thread->viewport[3] = command->height;
867         thread->validate |= DPSOFTRAST_VALIDATE_FB;
868 }
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
870 {
871         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
872         command->x = x;
873         command->y = y;
874         command->width = width;
875         command->height = height;
876
877         dpsoftrast.viewport[0] = x;
878         dpsoftrast.viewport[1] = y;
879         dpsoftrast.viewport[2] = width;
880         dpsoftrast.viewport[3] = height;
881         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
882 }
883
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
886 {
887         int i, x1, y1, x2, y2, w, h, x, y;
888         int miny1 = thread->miny1;
889         int maxy1 = thread->maxy1;
890         int miny2 = thread->miny2;
891         int maxy2 = thread->maxy2;
892         int bandy;
893         unsigned int *p;
894         unsigned int c;
895         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896         x1 = thread->fb_scissor[0];
897         y1 = thread->fb_scissor[1];
898         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
900         if (y1 < miny1) y1 = miny1;
901         if (y2 > maxy2) y2 = maxy2;
902         w = x2 - x1;
903         h = y2 - y1;
904         if (w < 1 || h < 1)
905                 return;
906         // FIXME: honor fb_colormask?
907         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
908         for (i = 0;i < 4;i++)
909         {
910                 if (!dpsoftrast.fb_colorpixels[i])
911                         continue;
912                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
913                 for (;y < bandy;y++)
914                 {
915                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916                         for (x = x1;x < x2;x++)
917                                 p[x] = c;
918                 }
919         }
920 }
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
922 {
923         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
924         command->r = r;
925         command->g = g;
926         command->b = b;
927         command->a = a;
928 }
929
930 DEFCOMMAND(3, ClearDepth, float depth;)
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
932 {
933         int x1, y1, x2, y2, w, h, x, y;
934         int miny1 = thread->miny1;
935         int maxy1 = thread->maxy1;
936         int miny2 = thread->miny2;
937         int maxy2 = thread->maxy2;
938         int bandy;
939         unsigned int *p;
940         unsigned int c;
941         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942         x1 = thread->fb_scissor[0];
943         y1 = thread->fb_scissor[1];
944         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
946         if (y1 < miny1) y1 = miny1;
947         if (y2 > maxy2) y2 = maxy2;
948         w = x2 - x1;
949         h = y2 - y1;
950         if (w < 1 || h < 1)
951                 return;
952         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
953         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
954         for (;y < bandy;y++)
955         {
956                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957                 for (x = x1;x < x2;x++)
958                         p[x] = c;
959         }
960 }
961 void DPSOFTRAST_ClearDepth(float d)
962 {
963         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
964         command->depth = d;
965 }
966
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
969 {
970         thread->colormask[0] = command->r != 0;
971         thread->colormask[1] = command->g != 0;
972         thread->colormask[2] = command->b != 0;
973         thread->colormask[3] = command->a != 0;
974         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
975 }
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
977 {
978         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
985 DEFCOMMAND(5, DepthTest, int enable;)
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
987 {
988         thread->depthtest = command->enable;
989         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
990 }
991 void DPSOFTRAST_DepthTest(int enable)
992 {
993         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994         command->enable = enable;
995 }
996
997 DEFCOMMAND(6, ScissorTest, int enable;)
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
999 {
1000         thread->scissortest = command->enable;
1001         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1002 }
1003 void DPSOFTRAST_ScissorTest(int enable)
1004 {
1005         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006         command->enable = enable;
1007 }
1008
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1011 {
1012         thread->scissor[0] = command->x;
1013         thread->scissor[1] = command->y;
1014         thread->scissor[2] = command->width;
1015         thread->scissor[3] = command->height;
1016         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1017 }
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1019 {
1020         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1021         command->x = x;
1022         command->y = y;
1023         command->width = width;
1024         command->height = height;
1025 }
1026
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1029 {
1030         thread->blendfunc[0] = command->sfactor;
1031         thread->blendfunc[1] = command->dfactor;
1032         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1033 }
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1035 {
1036         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037         command->sfactor = sfactor;
1038         command->dfactor = dfactor;
1039 }
1040
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1043 {
1044         thread->blendsubtract = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1046 }
1047 void DPSOFTRAST_BlendSubtract(int enable)
1048 {
1049         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(10, DepthMask, int enable;)
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1055 {
1056         thread->depthmask = command->enable;
1057 }
1058 void DPSOFTRAST_DepthMask(int enable)
1059 {
1060         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061         command->enable = enable;
1062 }
1063
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1066 {
1067         thread->depthfunc = command->func;
1068 }
1069 void DPSOFTRAST_DepthFunc(int func)
1070 {
1071         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072         command->func = func;
1073 }
1074
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1077 {
1078         thread->depthrange[0] = command->nearval;
1079         thread->depthrange[1] = command->farval;
1080 }
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1082 {
1083         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084         command->nearval = nearval;
1085         command->farval = farval;
1086 }
1087
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1090 {
1091         thread->polygonoffset[0] = command->alongnormal;
1092         thread->polygonoffset[1] = command->intoview;
1093 }
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1095 {
1096         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097         command->alongnormal = alongnormal;
1098         command->intoview = intoview;
1099 }
1100
1101 DEFCOMMAND(14, CullFace, int mode;)
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1103 {
1104         thread->cullface = command->mode;
1105 }
1106 void DPSOFTRAST_CullFace(int mode)
1107 {
1108         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109         command->mode = mode;
1110 }
1111
1112 DEFCOMMAND(15, AlphaTest, int enable;)
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1114 {
1115         thread->alphatest = command->enable;
1116 }
1117 void DPSOFTRAST_AlphaTest(int enable)
1118 {
1119         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120         command->enable = enable;
1121 }
1122
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1125 {
1126         thread->alphafunc = command->func;
1127         thread->alphavalue = command->ref;
1128 }
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1130 {
1131         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132         command->func = func;
1133         command->ref = ref;
1134 }
1135
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1137 {
1138         dpsoftrast.color[0] = r;
1139         dpsoftrast.color[1] = g;
1140         dpsoftrast.color[2] = b;
1141         dpsoftrast.color[3] = a;
1142 }
1143
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1145 {
1146         int outstride = blockwidth * 4;
1147         int instride = dpsoftrast.fb_width * 4;
1148         int bx1 = blockx;
1149         int by1 = blocky;
1150         int bx2 = blockx + blockwidth;
1151         int by2 = blocky + blockheight;
1152         int bw;
1153         int x;
1154         int y;
1155         unsigned char *inpixels;
1156         unsigned char *b;
1157         unsigned char *o;
1158         DPSOFTRAST_Flush();
1159         if (bx1 < 0) bx1 = 0;
1160         if (by1 < 0) by1 = 0;
1161         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1162         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1163         bw = bx2 - bx1;
1164         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1165         if (dpsoftrast.bigendian)
1166         {
1167                 for (y = by1;y < by2;y++)
1168                 {
1169                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1170                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1171                         for (x = bx1;x < bx2;x++)
1172                         {
1173                                 o[0] = b[3];
1174                                 o[1] = b[2];
1175                                 o[2] = b[1];
1176                                 o[3] = b[0];
1177                                 o += 4;
1178                                 b += 4;
1179                         }
1180                 }
1181         }
1182         else
1183         {
1184                 for (y = by1;y < by2;y++)
1185                 {
1186                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1187                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1188                         memcpy(o, b, bw*4);
1189                 }
1190         }
1191
1192 }
1193 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1194 {
1195         int tx1 = tx;
1196         int ty1 = ty;
1197         int tx2 = tx + width;
1198         int ty2 = ty + height;
1199         int sx1 = sx;
1200         int sy1 = sy;
1201         int sx2 = sx + width;
1202         int sy2 = sy + height;
1203         int swidth;
1204         int sheight;
1205         int twidth;
1206         int theight;
1207         int sw;
1208         int sh;
1209         int tw;
1210         int th;
1211         int y;
1212         unsigned int *spixels;
1213         unsigned int *tpixels;
1214         DPSOFTRAST_Texture *texture;
1215         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1216         if (mip < 0 || mip >= texture->mipmaps) return;
1217         DPSOFTRAST_Flush();
1218         spixels = dpsoftrast.fb_colorpixels[0];
1219         swidth = dpsoftrast.fb_width;
1220         sheight = dpsoftrast.fb_height;
1221         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1222         twidth = texture->mipmap[mip][2];
1223         theight = texture->mipmap[mip][3];
1224         if (tx1 < 0) tx1 = 0;
1225         if (ty1 < 0) ty1 = 0;
1226         if (tx2 > twidth) tx2 = twidth;
1227         if (ty2 > theight) ty2 = theight;
1228         if (sx1 < 0) sx1 = 0;
1229         if (sy1 < 0) sy1 = 0;
1230         if (sx2 > swidth) sx2 = swidth;
1231         if (sy2 > sheight) sy2 = sheight;
1232         tw = tx2 - tx1;
1233         th = ty2 - ty1;
1234         sw = sx2 - sx1;
1235         sh = sy2 - sy1;
1236         if (tw > sw) tw = sw;
1237         if (th > sh) th = sh;
1238         if (tw < 1 || th < 1)
1239                 return;
1240         for (y = 0;y < th;y++)
1241                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1242         if (texture->mipmaps > 1)
1243                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1244 }
1245
1246 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1247 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1248 {
1249         if (thread->texbound[command->unitnum])
1250                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1251         thread->texbound[command->unitnum] = command->texture;
1252 }
1253 void DPSOFTRAST_SetTexture(int unitnum, int index)
1254 {
1255         DPSOFTRAST_Command_SetTexture *command;
1256         DPSOFTRAST_Texture *texture;
1257         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1258         {
1259                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1260                 return;
1261         }
1262         texture = DPSOFTRAST_Texture_GetByIndex(index);
1263         if (index && !texture)
1264         {
1265                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1266                 return;
1267         }
1268
1269         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1270         command->unitnum = unitnum;
1271         command->texture = texture;
1272
1273         dpsoftrast.texbound[unitnum] = texture;
1274         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1275 }
1276
1277 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1278 {
1279         dpsoftrast.pointer_vertex3f = vertex3f;
1280         dpsoftrast.stride_vertex = stride;
1281 }
1282 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1283 {
1284         dpsoftrast.pointer_color4f = color4f;
1285         dpsoftrast.pointer_color4ub = NULL;
1286         dpsoftrast.stride_color = stride;
1287 }
1288 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1289 {
1290         dpsoftrast.pointer_color4f = NULL;
1291         dpsoftrast.pointer_color4ub = color4ub;
1292         dpsoftrast.stride_color = stride;
1293 }
1294 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1295 {
1296         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1297         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1298         dpsoftrast.stride_texcoord[unitnum] = stride;
1299 }
1300
1301 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1302 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1303 {
1304         thread->shader_mode = command->mode;
1305         thread->shader_permutation = command->permutation;
1306 }
1307 void DPSOFTRAST_SetShader(int mode, int permutation)
1308 {
1309         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1310         command->mode = mode;
1311         command->permutation = permutation;
1312
1313         dpsoftrast.shader_mode = mode;
1314         dpsoftrast.shader_permutation = permutation;
1315 }
1316
1317 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1318 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1319 {
1320         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1321 }
1322 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1323 {
1324         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1325         command->index = index;
1326         command->val[0] = v0;
1327         command->val[1] = v1;
1328         command->val[2] = v2;
1329         command->val[3] = v3;
1330
1331         dpsoftrast.uniform4f[index*4+0] = v0;
1332         dpsoftrast.uniform4f[index*4+1] = v1;
1333         dpsoftrast.uniform4f[index*4+2] = v2;
1334         dpsoftrast.uniform4f[index*4+3] = v3;
1335 }
1336 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1337 {
1338         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1339         command->index = index;
1340         memcpy(command->val, v, sizeof(command->val));
1341
1342         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1343 }
1344
1345 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1346 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1347 {
1348         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1349 }
1350 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1351 {
1352 #ifdef SSE2_PRESENT
1353         int i, index;
1354         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1355         {
1356                 __m128 m0, m1, m2, m3;
1357                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1358                 command->index = (DPSOFTRAST_UNIFORM)index;
1359                 if (((size_t)v)&(ALIGN_SIZE-1))
1360                 {
1361                         m0 = _mm_loadu_ps(v);
1362                         m1 = _mm_loadu_ps(v+4);
1363                         m2 = _mm_loadu_ps(v+8);
1364                         m3 = _mm_loadu_ps(v+12);
1365                 }
1366                 else
1367                 {
1368                         m0 = _mm_load_ps(v);
1369                         m1 = _mm_load_ps(v+4);
1370                         m2 = _mm_load_ps(v+8);
1371                         m3 = _mm_load_ps(v+12);
1372                 }
1373                 if (transpose)
1374                 {
1375                         __m128 t0, t1, t2, t3;
1376                         t0 = _mm_unpacklo_ps(m0, m1);
1377                         t1 = _mm_unpacklo_ps(m2, m3);
1378                         t2 = _mm_unpackhi_ps(m0, m1);
1379                         t3 = _mm_unpackhi_ps(m2, m3);
1380                         m0 = _mm_movelh_ps(t0, t1);
1381                         m1 = _mm_movehl_ps(t1, t0);
1382                         m2 = _mm_movelh_ps(t2, t3);
1383                         m3 = _mm_movehl_ps(t3, t2);                     
1384                 }
1385                 _mm_store_ps(command->val, m0);
1386                 _mm_store_ps(command->val+4, m1);
1387                 _mm_store_ps(command->val+8, m2);
1388                 _mm_store_ps(command->val+12, m3);
1389                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1390                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1391                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1392                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1393         }
1394 #endif
1395 }
1396
1397 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1398 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1399 {
1400         thread->uniform1i[command->index] = command->val;
1401 }
1402 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1403 {
1404         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1405         command->index = index;
1406         command->val = i0;
1407
1408         dpsoftrast.uniform1i[command->index] = i0;
1409 }
1410
1411 #ifdef SSE2_PRESENT
1412 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1413 {
1414         float *end = dst + size*4;
1415         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1416         {
1417                 while (dst < end)
1418                 {
1419                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1420                         dst += 4;
1421                         src += stride;
1422                 }
1423         }
1424         else
1425         {
1426                 while (dst < end)
1427                 {
1428                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1429                         dst += 4;
1430                         src += stride;
1431                 }
1432         }
1433 }
1434
// Expands 3-component float vectors into 4-component vectors with w set to 1.
//   dst    - receives size*4 floats; written with _mm_store_ps, so it has to be 16-byte aligned
//   src    - address of the first input vector (three consecutive floats)
//   size   - number of vectors to convert
//   stride - byte distance between the starts of consecutive input vectors
// Tightly packed input (stride == 12) is converted four vectors at a time.
// The very last vector is fetched with narrow loads so that nothing beyond its
// 12 bytes is read; a 16-byte load there would run 4 bytes past the input.
static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	if (stride == sizeof(float[3]))
	{
		// three 16-byte loads hold exactly four packed vectors:
		// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
		// each output is built as (?, x, y, z), gets its first lane replaced
		// by 1 and is then rotated into (x, y, z, 1)
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				__m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
				dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
				dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
		else
		{
			while (dst < end4)
			{
				__m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
				dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
				dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
				_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
	}
	if (dst < end)
	{
		// everything but the final vector uses a 16-byte load; the fourth
		// float it picks up is replaced by 1 before the store
		float *last = end - 4;
		if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
		{
			while (dst < last)
			{
				__m128 v = _mm_loadu_ps((const float *)src);
				v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
				v = _mm_move_ss(v, _mm_set_ss(1.0f));
				v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
				_mm_store_ps(dst, v);
				dst += 4;
				src += stride;
			}
		}
		else
		{
			while (dst < last)
			{
				__m128 v = _mm_load_ps((const float *)src);
				v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
				v = _mm_move_ss(v, _mm_set_ss(1.0f));
				v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
				_mm_store_ps(dst, v);
				dst += 4;
				src += stride;
			}
		}
		// final vector: an 8-byte load for x,y and a scalar load for z touch
		// exactly the 12 bytes that belong to it
		{
			__m128 xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)src);
			__m128 z1 = _mm_unpacklo_ps(_mm_load_ss((const float *)src + 2), _mm_set_ss(1.0f));
			_mm_store_ps(dst, _mm_movelh_ps(xy, z1));
		}
	}
}
1511
// Expands 2-component float vectors into 4-component vectors (a, b, 0, 1).
//   dst    - receives size*4 floats through aligned 16-byte stores
//   src    - address of the first input pair
//   size   - number of pairs to convert
//   stride - byte distance between the starts of consecutive pairs
static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float * const stop = dst + size*4;
	const __m128 pad = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
	if (stride == sizeof(float[2]))
	{
		// tightly packed input: one 16-byte load carries two pairs.
		// src moves in 16-byte steps, so its alignment is decided once.
		float * const pairstop = dst + (size&~1)*4;
		const int misaligned = (((size_t)src)&(ALIGN_SIZE - 1)) != 0;
		for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
		{
			__m128 both;
			if (misaligned)
				both = _mm_loadu_ps((const float *)src);
			else
				both = _mm_load_ps((const float *)src);
			// low half of the load + (0, 1)
			_mm_store_ps(dst, _mm_shuffle_ps(both, pad, _MM_SHUFFLE(3, 2, 1, 0)));
			// high half of the load + (0, 1)
			_mm_store_ps(dst + 4, _mm_movehl_ps(pad, both));
		}
	}
	// remaining pairs (or any other stride): one 8-byte load each
	for (; dst < stop; dst += 4, src += stride)
		_mm_store_ps(dst, _mm_loadl_pi(pad, (const __m64 *)src));
}
1549
// Converts 4-byte vectors into 4-float vectors, scaling every byte by 1/255.
//   dst    - receives size*4 floats through aligned 16-byte stores
//   src    - address of the first 4-byte vector
//   size   - number of vectors to convert
//   stride - byte distance between the starts of consecutive vectors
// Component order is kept as-is: byte i of a vector becomes float i.
static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	__m128 scale = _mm_set1_ps(1.0f/255.0f);
	if (stride == sizeof(unsigned char[4]))
	{
		// tightly packed input: 16 bytes = four vectors per iteration,
		// widened 8 -> 16 -> 32 bit by interleaving with zero
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				__m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
		else
		{
			while (dst < end4)
			{
				__m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
	}
	while (dst < end)
	{
		// the 4 bytes are copied out rather than read through an int pointer:
		// src has no alignment guarantee here and does not point at an int object
		int packed;
		__m128i v;
		memcpy(&packed, src, sizeof(packed));
		v = _mm_cvtsi32_si128(packed);
		_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
		dst += 4;
		src += stride;
	}
}
1592
// Writes `size` copies of the 4-float vector at src into dst.
// src is read once with an unaligned load; dst is written with aligned
// 16-byte stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + i*4, value);
}
1603 #endif
1604
// Transforms numitems 4-float vectors by a 4x4 matrix given as 16 floats:
//   out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
// out4f may be the same buffer as in4f. An identity matrix turns the call
// into a plain copy (or nothing at all when transforming in place).
// In SSE2 builds out4f is written with aligned 16-byte stores; other builds
// use the scalar loop at the end, which evaluates the same sum in the same order.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
#ifdef SSE2_PRESENT
	__m128 m0, m1, m2, m3;
	float *end;
#else
	int i, j;
#endif
	// DPSOFTRAST_Array_Load returns NULL in builds without SSE2 and
	// DPSOFTRAST_Array_Transform hands that result straight to this function
	if (!out4f || !in4f || numitems <= 0)
		return;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
	{
		// fast case for identity matrix
		if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
#ifdef SSE2_PRESENT
	end = out4f + numitems*4;
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			_mm_store_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
	else
	{
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			_mm_store_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
#else
	for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
	{
		// read the whole input vector first so in-place transforms work
		const float x = in4f[0], y = in4f[1], z = in4f[2], w = in4f[3];
		for (j = 0; j < 4; j++)
			out4f[j] = x*inmatrix16f[j] + (y*inmatrix16f[4+j] + (z*inmatrix16f[8+j] + w*inmatrix16f[12+j]));
	}
#endif
}
1652
// Copies numitems 4-float vectors from in4f to out4f.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// memcpy needs non-overlapping buffers, so copying a buffer onto itself is
	// skipped (the identity path of DPSOFTRAST_Vertex_Transform does the same);
	// a non-positive count would otherwise turn into a huge size_t
	if (out4f == in4f || numitems <= 0)
		return;
	memcpy(out4f, in4f, numitems * sizeof(float[4]));
}
1657
1658 #ifdef SSE2_PRESENT
// Divides a 4-float vertex by its w and maps it through the viewport vectors.
// With in = (x, y, z, w), viewportcenter = (c0, c1, c2, c3) and
// viewportscale = (s0, s1, s2, s3) the result is
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// `in` is copied before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 pv_src = (in); \
	__m128 pv_wwww = _mm_shuffle_ps(pv_src, pv_src, _MM_SHUFFLE(3, 3, 3, 3)); \
	__m128 pv_1xyz = _mm_shuffle_ps(pv_src, pv_src, _MM_SHUFFLE(2, 1, 0, 3)); \
	pv_1xyz = _mm_move_ss(pv_1xyz, _mm_set_ss(1.0f)); \
	pv_1xyz = _mm_div_ps(_mm_mul_ps(viewportscale, pv_1xyz), pv_wwww); \
	pv_1xyz = _mm_add_ps(viewportcenter, pv_1xyz); \
	out = _mm_shuffle_ps(pv_1xyz, pv_1xyz, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1666
// Performs the same steps as DPSOFTRAST_PROJECTVERTEX: with in = (x, y, z, w),
// viewportcenter = (c0, c1, c2, c3) and viewportscale = (s0, s1, s2, s3)
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// `in` is copied before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 py_src = (in); \
	__m128 py_rot = _mm_move_ss(_mm_shuffle_ps(py_src, py_src, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	__m128 py_den = _mm_shuffle_ps(py_src, py_src, _MM_SHUFFLE(3, 3, 3, 3)); \
	__m128 py_scr = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, py_rot), py_den)); \
	out = _mm_shuffle_ps(py_scr, py_scr, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1674
// Combines four __m128 matrix vectors weighted by the components of `in`:
//   out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3))
// `in` is copied before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 tv_src = (in); \
	__m128 tv_sum = _mm_mul_ps(_mm_shuffle_ps(tv_src, tv_src, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	tv_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(tv_src, tv_src, _MM_SHUFFLE(2, 2, 2, 2)), m2), tv_sum); \
	tv_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(tv_src, tv_src, _MM_SHUFFLE(1, 1, 1, 1)), m1), tv_sum); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(tv_src, tv_src, _MM_SHUFFLE(0, 0, 0, 0)), m0), tv_sum); \
}
1683
// Computes the range of screen rows covered by the box spanned by minpos/maxpos.
// The eight box corners are transformed by m0..m3 (with the first two lanes of
// every matrix vector swapped, so lane 0 of the result is the second output
// component). A corner counts as clipped when its third plus fourth component
// is negative; for every edge that joins a clipped corner to an unclipped one
// the point where that sum reaches zero is used in place of the clipped corner.
// The extremes of (second component / fourth component), limited to [-2, 2],
// are mapped with lane 2 of viewportcenter/viewportscale:
//   *starty receives the truncated value for the maximum,
//   *endy   receives the truncated value for the minimum, plus one.
// Returns a mask with bit k set for every clipped corner k (bit 0 of k selects
// the max x, bit 1 the max y, bit 2 the max z/w of the box).
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	static const int neighborbit[3] = {1, 2, 4};
	__m128 corner[8], bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f);
	__m128 maxproj = _mm_set_ss(-2.0f);
	int clipmask = 0xFF;
	int k, n;

	// swap lanes 0 and 1 so the scalar (lane 0) operations below act on y
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));

	corner[0] = minpos;
	corner[1] = _mm_move_ss(minpos, maxpos);
	corner[2] = _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[3] = _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[4] = _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[5] = _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[6] = _mm_move_ss(maxpos, minpos);
	corner[7] = maxpos;

	// pass 1: transform every corner and take the unclipped ones into account
	for (k = 0; k < 8; k++)
	{
		__m128 c = corner[k];
		__m128 t = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)), m0),
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)), m1),
					_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)), m2),
						_mm_mul_ps(_mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 3, 3, 3)), m3))));
		__m128 proj;
		bb[k] = t;
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(t, t, _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 3, 3, 3)));
		if (!_mm_ucomige_ss(clipdist[k], _mm_setzero_ps()))
			continue;
		clipmask &= ~(1<<k);
		proj = _mm_div_ss(t, _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 3, 3, 3)));
		minproj = _mm_min_ss(minproj, proj);
		maxproj = _mm_max_ss(maxproj, proj);
	}

	// pass 2: for clipped corners, walk the three box edges towards unclipped
	// neighbors and use the point on the edge where the clip distance is zero
	for (k = 0; k < 8; k++)
	{
		if (!(clipmask&(1<<k)))
			continue;
		for (n = 0; n < 3; n++)
		{
			const int other = k ^ neighborbit[n];
			__m128 frac, proj;
			if (clipmask&(1<<other))
				continue;
			frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[other]));
			proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[other], bb[k])));
			proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3)));
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// bring lane 2 of the viewport vectors into lane 0
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1754         
1755 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1756 {
1757         float *end = out4f + numitems*4;
1758         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1759         __m128 minpos, maxpos;
1760         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1761         {
1762                 minpos = maxpos = _mm_loadu_ps(in4f);
1763                 while (out4f < end)
1764                 {
1765                         __m128 v = _mm_loadu_ps(in4f);
1766                         minpos = _mm_min_ps(minpos, v);
1767                         maxpos = _mm_max_ps(maxpos, v);
1768                         _mm_store_ps(out4f, v);
1769                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1770                         _mm_store_ps(screen4f, v);
1771                         in4f += 4;
1772                         out4f += 4;
1773                         screen4f += 4;
1774                 }
1775         }
1776         else
1777         {
1778                 minpos = maxpos = _mm_load_ps(in4f);
1779                 while (out4f < end)
1780                 {
1781                         __m128 v = _mm_load_ps(in4f);
1782                         minpos = _mm_min_ps(minpos, v);
1783                         maxpos = _mm_max_ps(maxpos, v);
1784                         _mm_store_ps(out4f, v);
1785                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1786                         _mm_store_ps(screen4f, v);
1787                         in4f += 4;
1788                         out4f += 4;
1789                         screen4f += 4;
1790                 }
1791         }
1792         if (starty && endy) 
1793                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1794                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1795                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1796                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1797                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1798         return 0;
1799 }
1800
1801 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1802 {
1803         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1804         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1805         float *end;
1806         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1807                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1808         end = out4f + numitems*4;
1809         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1810         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1811         m0 = _mm_loadu_ps(inmatrix16f);
1812         m1 = _mm_loadu_ps(inmatrix16f + 4);
1813         m2 = _mm_loadu_ps(inmatrix16f + 8);
1814         m3 = _mm_loadu_ps(inmatrix16f + 12);
1815         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1816         {
1817                 minpos = maxpos = _mm_loadu_ps(in4f);
1818                 while (out4f < end)
1819                 {
1820                         __m128 v = _mm_loadu_ps(in4f);
1821                         minpos = _mm_min_ps(minpos, v);
1822                         maxpos = _mm_max_ps(maxpos, v);
1823                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1824                         _mm_store_ps(out4f, v);
1825                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1826                         _mm_store_ps(screen4f, v);
1827                         in4f += 4;
1828                         out4f += 4;
1829                         screen4f += 4;
1830                 }
1831         }
1832         else
1833         {
1834                 minpos = maxpos = _mm_load_ps(in4f);
1835                 while (out4f < end)
1836                 {
1837                         __m128 v = _mm_load_ps(in4f);
1838                         minpos = _mm_min_ps(minpos, v);
1839                         maxpos = _mm_max_ps(maxpos, v);
1840                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1841                         _mm_store_ps(out4f, v);
1842                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843                         _mm_store_ps(screen4f, v);
1844                         in4f += 4;
1845                         out4f += 4;
1846                         screen4f += 4;
1847                 }
1848         }
1849         if (starty && endy) 
1850                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1851         return 0;
1852 }
1853 #endif
1854
// Converts one of the application's vertex arrays into 4-float vectors stored
// in dpsoftrast.post_array4f[outarray] and returns that buffer.
//   DPSOFTRAST_ARRAY_POSITION - 3-float positions
//   DPSOFTRAST_ARRAY_COLOR    - 4-float colors, else 4-byte colors, else the
//                               constant dpsoftrast.color for every vertex
//   anything else             - texcoord array (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//                               with 2, 3 or 4 float components
// A texcoord array without a pointer, or with another component count, leaves
// the output buffer untouched. Returns NULL in builds without SSE2.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		const int stride = dpsoftrast.stride_vertex;
		const unsigned char *inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		const int stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
			DPSOFTRAST_Load4fTo4f(outf, (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride, numvertices, stride);
		else if (dpsoftrast.pointer_color4ub)
			DPSOFTRAST_Load4bTo4f(outf, (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride, numvertices, stride);
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		const int stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			const unsigned char *inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			default:
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1913
1914 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1915 {
1916         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1917         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1918         return data;
1919 }
1920
#if 0
// Currently compiled out.
// Projects a vertex array into dpsoftrast.screencoord4f without transforming
// it, stores the clip mask in dpsoftrast.drawclipped and the row range in
// dpsoftrast.drawstarty/drawendy, and returns the array.
// A non-negative inarray is loaded into post_array4f[outarray] first.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1933
// Transforms a vertex array in place by inmatrix16f, writes the projected
// vertices to dpsoftrast.screencoord4f, stores the clip mask in
// dpsoftrast.drawclipped and the row range in dpsoftrast.drawstarty/drawendy,
// and returns the array.
// A non-negative inarray is loaded into post_array4f[outarray] first.
// Returns NULL in builds without SSE2.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1944
1945 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1946 {
1947         int x;
1948         int startx = span->startx;
1949         int endx = span->endx;
1950         float wslope = triangle->w[0];
1951         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1952         float endz = 1.0f / (w + wslope * startx);
1953         for (x = startx;x < endx;)
1954         {
1955                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1956                 float z = endz, dz;
1957                 if (nextsub >= endx) nextsub = endsub = endx-1;
1958                 endz = 1.0f / (w + wslope * nextsub);
1959                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1960                 for (; x <= endsub; x++, z += dz)
1961                         zf[x] = z;
1962         }
1963 }
1964
1965 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1966 {
1967         int x;
1968         int startx = span->startx;
1969         int endx = span->endx;
1970         int d[4];
1971         float a, b;
1972         unsigned char * RESTRICT pixelmask = span->pixelmask;
1973         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1974         if (!pixel)
1975                 return;
1976         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1977         // handle alphatest now (this affects depth writes too)
1978         if (thread->alphatest)
1979                 for (x = startx;x < endx;x++)
1980                         if (in4f[x*4+3] < 0.5f)
1981                                 pixelmask[x] = false;
1982         // FIXME: this does not handle bigendian
1983         switch(thread->fb_blendmode)
1984         {
1985         case DPSOFTRAST_BLENDMODE_OPAQUE:
1986                 for (x = startx;x < endx;x++)
1987                 {
1988                         if (!pixelmask[x])
1989                                 continue;
1990                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1991                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1992                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1993                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1994                         pixel[x*4+0] = d[0];
1995                         pixel[x*4+1] = d[1];
1996                         pixel[x*4+2] = d[2];
1997                         pixel[x*4+3] = d[3];
1998                 }
1999                 break;
2000         case DPSOFTRAST_BLENDMODE_ALPHA:
2001                 for (x = startx;x < endx;x++)
2002                 {
2003                         if (!pixelmask[x])
2004                                 continue;
2005                         a = in4f[x*4+3] * 255.0f;
2006                         b = 1.0f - in4f[x*4+3];
2007                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2008                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2009                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2010                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2011                         pixel[x*4+0] = d[0];
2012                         pixel[x*4+1] = d[1];
2013                         pixel[x*4+2] = d[2];
2014                         pixel[x*4+3] = d[3];
2015                 }
2016                 break;
2017         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2018                 for (x = startx;x < endx;x++)
2019                 {
2020                         if (!pixelmask[x])
2021                                 continue;
2022                         a = in4f[x*4+3] * 255.0f;
2023                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2024                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2025                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2026                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2027                         pixel[x*4+0] = d[0];
2028                         pixel[x*4+1] = d[1];
2029                         pixel[x*4+2] = d[2];
2030                         pixel[x*4+3] = d[3];
2031                 }
2032                 break;
2033         case DPSOFTRAST_BLENDMODE_ADD:
2034                 for (x = startx;x < endx;x++)
2035                 {
2036                         if (!pixelmask[x])
2037                                 continue;
2038                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2039                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2040                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2041                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2042                         pixel[x*4+0] = d[0];
2043                         pixel[x*4+1] = d[1];
2044                         pixel[x*4+2] = d[2];
2045                         pixel[x*4+3] = d[3];
2046                 }
2047                 break;
2048         case DPSOFTRAST_BLENDMODE_INVMOD:
2049                 for (x = startx;x < endx;x++)
2050                 {
2051                         if (!pixelmask[x])
2052                                 continue;
2053                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2054                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2055                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2056                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2057                         pixel[x*4+0] = d[0];
2058                         pixel[x*4+1] = d[1];
2059                         pixel[x*4+2] = d[2];
2060                         pixel[x*4+3] = d[3];
2061                 }
2062                 break;
2063         case DPSOFTRAST_BLENDMODE_MUL:
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (!pixelmask[x])
2067                                 continue;
2068                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2069                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2070                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2071                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2072                         pixel[x*4+0] = d[0];
2073                         pixel[x*4+1] = d[1];
2074                         pixel[x*4+2] = d[2];
2075                         pixel[x*4+3] = d[3];
2076                 }
2077                 break;
2078         case DPSOFTRAST_BLENDMODE_MUL2:
2079                 for (x = startx;x < endx;x++)
2080                 {
2081                         if (!pixelmask[x])
2082                                 continue;
2083                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2084                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2085                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2086                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2087                         pixel[x*4+0] = d[0];
2088                         pixel[x*4+1] = d[1];
2089                         pixel[x*4+2] = d[2];
2090                         pixel[x*4+3] = d[3];
2091                 }
2092                 break;
2093         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2094                 for (x = startx;x < endx;x++)
2095                 {
2096                         if (!pixelmask[x])
2097                                 continue;
2098                         a = in4f[x*4+3] * -255.0f;
2099                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2100                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2101                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2102                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2103                         pixel[x*4+0] = d[0];
2104                         pixel[x*4+1] = d[1];
2105                         pixel[x*4+2] = d[2];
2106                         pixel[x*4+3] = d[3];
2107                 }
2108                 break;
2109         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2110                 for (x = startx;x < endx;x++)
2111                 {
2112                         if (!pixelmask[x])
2113                                 continue;
2114                         a = 255.0f;
2115                         b = 1.0f - in4f[x*4+3];
2116                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2117                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2118                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2119                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2120                         pixel[x*4+0] = d[0];
2121                         pixel[x*4+1] = d[1];
2122                         pixel[x*4+2] = d[2];
2123                         pixel[x*4+3] = d[3];
2124                 }
2125                 break;
2126         case DPSOFTRAST_BLENDMODE_INVADD:
2127                 for (x = startx;x < endx;x++)
2128                 {
2129                         if (!pixelmask[x])
2130                                 continue;
2131                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2132                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2133                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2134                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2135                         pixel[x*4+0] = d[0];
2136                         pixel[x*4+1] = d[1];
2137                         pixel[x*4+2] = d[2];
2138                         pixel[x*4+3] = d[3];
2139                 }
2140                 break;
2141         }
2142 }
2143
2144 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2145 {
2146 #ifdef SSE2_PRESENT
2147         int x;
2148         int startx = span->startx;
2149         int endx = span->endx;
2150         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2151         unsigned char * RESTRICT pixelmask = span->pixelmask;
2152         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2153         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2154         if (!pixel)
2155                 return;
2156         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2157         pixeli += span->y * dpsoftrast.fb_width + span->x;
2158         // handle alphatest now (this affects depth writes too)
2159         if (thread->alphatest)
2160                 for (x = startx;x < endx;x++)
2161                         if (in4ub[x*4+3] < 0.5f)
2162                                 pixelmask[x] = false;
2163         // FIXME: this does not handle bigendian
2164         switch(thread->fb_blendmode)
2165         {
2166         case DPSOFTRAST_BLENDMODE_OPAQUE:
2167                 for (x = startx;x + 4 <= endx;)
2168                 {
2169                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2170                         {
2171                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2172                                 x += 4;
2173                         }
2174                         else
2175                         {
2176                                 if (pixelmask[x])
2177                                         pixeli[x] = ini[x];
2178                                 x++;
2179                         }
2180                 }
2181                 for (;x < endx;x++)
2182                         if (pixelmask[x])
2183                                 pixeli[x] = ini[x];
2184                 break;
2185         case DPSOFTRAST_BLENDMODE_ALPHA:
2186         #define FINISHBLEND(blend2, blend1) \
2187                 for (x = startx;x + 1 < endx;x += 2) \
2188                 { \
2189                         __m128i src, dst; \
2190                         switch (*(const unsigned short*)&pixelmask[x]) \
2191                         { \
2192                         case 0x0101: \
2193                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2194                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2195                                 blend2; \
2196                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2197                                 continue; \
2198                         case 0x0100: \
2199                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2200                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2201                                 blend1; \
2202                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2203                                 continue; \
2204                         case 0x0001: \
2205                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2206                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2207                                 blend1; \
2208                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2209                                 continue; \
2210                         } \
2211                         break; \
2212                 } \
2213                 for(;x < endx; x++) \
2214                 { \
2215                         __m128i src, dst; \
2216                         if (!pixelmask[x]) \
2217                                 continue; \
2218                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2220                         blend1; \
2221                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2222                 }
2223
2224                 FINISHBLEND({
2225                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2227                 }, {
2228                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230                 });
2231                 break;
2232         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2233                 FINISHBLEND({
2234                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2236                 }, {
2237                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239                 });
2240                 break;
2241         case DPSOFTRAST_BLENDMODE_ADD:
2242                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2243                 break;
2244         case DPSOFTRAST_BLENDMODE_INVMOD:
2245                 FINISHBLEND({
2246                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2247                 }, {
2248                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                 });
2250                 break;
2251         case DPSOFTRAST_BLENDMODE_MUL:
2252                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2253                 break;
2254         case DPSOFTRAST_BLENDMODE_MUL2:
2255                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2256                 break;
2257         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2258                 FINISHBLEND({
2259                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2261                 }, {
2262                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264                 });
2265                 break;
2266         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2267                 FINISHBLEND({
2268                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2270                 }, {
2271                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273                 });
2274                 break;
2275         case DPSOFTRAST_BLENDMODE_INVADD:
2276                 FINISHBLEND({
2277                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2278                 }, {
2279                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                 });
2281                 break;
2282         }
2283 #endif
2284 }
2285
2286 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2287 {
2288         int x;
2289         int startx = span->startx;
2290         int endx = span->endx;
2291         int flags;
2292         float c[4];
2293         float data[4];
2294         float slope[4];
2295         float tc[2], endtc[2];
2296         float tcscale[2];
2297         unsigned int tci[2];
2298         unsigned int tci1[2];
2299         unsigned int tcimin[2];
2300         unsigned int tcimax[2];
2301         int tciwrapmask[2];
2302         int tciwidth;
2303         int filter;
2304         int mip;
2305         const unsigned char * RESTRICT pixelbase;
2306         const unsigned char * RESTRICT pixel[4];
2307         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2308         // if no texture is bound, just fill it with white
2309         if (!texture)
2310         {
2311                 for (x = startx;x < endx;x++)
2312                 {
2313                         out4f[x*4+0] = 1.0f;
2314                         out4f[x*4+1] = 1.0f;
2315                         out4f[x*4+2] = 1.0f;
2316                         out4f[x*4+3] = 1.0f;
2317                 }
2318                 return;
2319         }
2320         mip = triangle->mip[texunitindex];
2321         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2322         // if this mipmap of the texture is 1 pixel, just fill it with that color
2323         if (texture->mipmap[mip][1] == 4)
2324         {
2325                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2326                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2327                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2328                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2329                 for (x = startx;x < endx;x++)
2330                 {
2331                         out4f[x*4+0] = c[0];
2332                         out4f[x*4+1] = c[1];
2333                         out4f[x*4+2] = c[2];
2334                         out4f[x*4+3] = c[3];
2335                 }
2336                 return;
2337         }
2338         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2339         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2340         flags = texture->flags;
2341         tcscale[0] = texture->mipmap[mip][2];
2342         tcscale[1] = texture->mipmap[mip][3];
2343         tciwidth = texture->mipmap[mip][2];
2344         tcimin[0] = 0;
2345         tcimin[1] = 0;
2346         tcimax[0] = texture->mipmap[mip][2]-1;
2347         tcimax[1] = texture->mipmap[mip][3]-1;
2348         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2349         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2350         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2351         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2352         for (x = startx;x < endx;)
2353         {
2354                 unsigned int subtc[2];
2355                 unsigned int substep[2];
2356                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2357                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2358                 if (nextsub >= endx)
2359                 {
2360                         nextsub = endsub = endx-1;      
2361                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2362                 }
2363                 tc[0] = endtc[0];
2364                 tc[1] = endtc[1];
2365                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2366                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2367                 substep[0] = (endtc[0] - tc[0]) * subscale;
2368                 substep[1] = (endtc[1] - tc[1]) * subscale;
2369                 subtc[0] = tc[0] * (1<<16);
2370                 subtc[1] = tc[1] * (1<<16);
2371                 if (filter)
2372                 {
2373                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2374                         {
2375                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2376                                 {
2377                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2378                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2379                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2380                                         tci[0] = subtc[0]>>16;
2381                                         tci[1] = subtc[1]>>16;
2382                                         tci1[0] = tci[0] + 1;
2383                                         tci1[1] = tci[1] + 1;
2384                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2385                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2386                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2387                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2388                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2389                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2390                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2391                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2392                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2393                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2394                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2395                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2396                                         out4f[x*4+0] = c[0];
2397                                         out4f[x*4+1] = c[1];
2398                                         out4f[x*4+2] = c[2];
2399                                         out4f[x*4+3] = c[3];
2400                                 }
2401                         }
2402                         else
2403                         {
2404                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2405                                 {
2406                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2407                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2408                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2409                                         tci[0] = subtc[0]>>16;
2410                                         tci[1] = subtc[1]>>16;
2411                                         tci1[0] = tci[0] + 1;
2412                                         tci1[1] = tci[1] + 1;
2413                                         tci[0] &= tciwrapmask[0];
2414                                         tci[1] &= tciwrapmask[1];
2415                                         tci1[0] &= tciwrapmask[0];
2416                                         tci1[1] &= tciwrapmask[1];
2417                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2418                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2419                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2420                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2421                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2422                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2423                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2424                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2425                                         out4f[x*4+0] = c[0];
2426                                         out4f[x*4+1] = c[1];
2427                                         out4f[x*4+2] = c[2];
2428                                         out4f[x*4+3] = c[3];
2429                                 }
2430                         }
2431                 }
2432                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2433                 {
2434                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2435                         {
2436                                 tci[0] = subtc[0]>>16;
2437                                 tci[1] = subtc[1]>>16;
2438                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2439                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2440                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2441                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2442                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2443                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2444                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2445                                 out4f[x*4+0] = c[0];
2446                                 out4f[x*4+1] = c[1];
2447                                 out4f[x*4+2] = c[2];
2448                                 out4f[x*4+3] = c[3];
2449                         }
2450                 }
2451                 else
2452                 {
2453                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2454                         {
2455                                 tci[0] = subtc[0]>>16;
2456                                 tci[1] = subtc[1]>>16;
2457                                 tci[0] &= tciwrapmask[0];
2458                                 tci[1] &= tciwrapmask[1];
2459                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2461                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2462                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2463                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2464                                 out4f[x*4+0] = c[0];
2465                                 out4f[x*4+1] = c[1];
2466                                 out4f[x*4+2] = c[2];
2467                                 out4f[x*4+3] = c[3];
2468                         }
2469                 }
2470         }
2471 }
2472
2473 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2474 {
2475 #ifdef SSE2_PRESENT
2476         int x;
2477         int startx = span->startx;
2478         int endx = span->endx;
2479         int flags;
2480         __m128 data, slope, tcscale;
2481         __m128i tcsize, tcmask, tcoffset, tcmax;
2482         __m128 tc, endtc;
2483         __m128i subtc, substep, endsubtc;
2484         int filter;
2485         int mip;
2486         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2487         const unsigned char * RESTRICT pixelbase;
2488         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2489         // if no texture is bound, just fill it with white
2490         if (!texture)
2491         {
2492                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2493                 return;
2494         }
2495         mip = triangle->mip[texunitindex];
2496         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2497         // if this mipmap of the texture is 1 pixel, just fill it with that color
2498         if (texture->mipmap[mip][1] == 4)
2499         {
2500                 unsigned int k = *((const unsigned int *)pixelbase);
2501                 for (x = startx;x < endx;x++)
2502                         outi[x] = k;
2503                 return;
2504         }
2505         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2506         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2507         flags = texture->flags;
2508         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2509         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2510         tcscale = _mm_cvtepi32_ps(tcsize);
2511         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2512         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2513         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2514         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2515         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2516         tcmax = _mm_packs_epi32(tcmask, tcmask);
2517         for (x = startx;x < endx;)
2518         {
2519                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2520                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2521                 if (nextsub >= endx)
2522                 {
2523                         nextsub = endsub = endx-1;
2524                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2525                 }       
2526                 tc = endtc;
2527                 subtc = endsubtc;
2528                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2529                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2530                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2531                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2532                 substep = _mm_slli_epi32(substep, 1);
2533                 if (filter)
2534                 {
2535                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2536                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2537                         {
2538                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2539                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2540                                 {
2541                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2542                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2543                                         tci = _mm_madd_epi16(tci, tcoffset);
2544                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2545                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2546                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2547                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2548                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2549                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2550                                         fracm = _mm_srli_epi16(subtc, 1);
2551                                         pix1 = _mm_add_epi16(pix1,
2552                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2553                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2554                                         pix3 = _mm_add_epi16(pix3,
2555                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2556                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2557                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2558                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2559                                         pix2 = _mm_add_epi16(pix2,
2560                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2561                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2562                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2563                                 }
2564                                 if (x <= endsub)
2565                                 {
2566                                         const unsigned char * RESTRICT ptr1;
2567                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2568                                         tci = _mm_madd_epi16(tci, tcoffset);
2569                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2570                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2571                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2572                                         fracm = _mm_srli_epi16(subtc, 1);
2573                                         pix1 = _mm_add_epi16(pix1,
2574                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2575                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2576                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2577                                         pix1 = _mm_add_epi16(pix1,
2578                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2579                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2580                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2581                                         x++;
2582                                 }
2583                         }
2584                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2585                         {
2586                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2587                                 {
2588                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2589                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2590                                         tci = _mm_madd_epi16(tci, tcoffset);
2591                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2592                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2593                                                                                         _mm_setzero_si128());
2594                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2595                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2596                                                                                         _mm_setzero_si128());
2597                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2598                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2599                                         tci = _mm_madd_epi16(tci, tcoffset);
2600                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2601                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2602                                                                                         _mm_setzero_si128());
2603                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2604                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2605                                                                                         _mm_setzero_si128());
2606                                         fracm = _mm_srli_epi16(subtc, 1);
2607                                         pix1 = _mm_add_epi16(pix1,
2608                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2609                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2610                                         pix3 = _mm_add_epi16(pix3,
2611                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2612                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2613                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2614                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2615                                         pix2 = _mm_add_epi16(pix2,
2616                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2617                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2618                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2619                                 }
2620                                 if (x <= endsub)
2621                                 {
2622                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2623                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2624                                         tci = _mm_madd_epi16(tci, tcoffset);
2625                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2626                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2627                                                                                         _mm_setzero_si128());
2628                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2629                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2630                                                                                         _mm_setzero_si128());
2631                                         fracm = _mm_srli_epi16(subtc, 1);
2632                                         pix1 = _mm_add_epi16(pix1,
2633                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2636                                         pix1 = _mm_add_epi16(pix1,
2637                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2638                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2639                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2640                                         x++;
2641                                 }
2642                         }
2643                         else
2644                         {
2645                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2646                                 {
2647                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2648                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2649                                         tci = _mm_madd_epi16(tci, tcoffset);
2650                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2651                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2652                                                                                         _mm_setzero_si128());
2653                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2654                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2655                                                                                         _mm_setzero_si128());
2656                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2657                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2658                                         tci = _mm_madd_epi16(tci, tcoffset);
2659                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2660                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2661                                                                                         _mm_setzero_si128());
2662                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2663                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2664                                                                                         _mm_setzero_si128());
2665                                         fracm = _mm_srli_epi16(subtc, 1);
2666                                         pix1 = _mm_add_epi16(pix1,
2667                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2668                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2669                                         pix3 = _mm_add_epi16(pix3,
2670                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2671                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2672                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2673                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2674                                         pix2 = _mm_add_epi16(pix2,
2675                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2676                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2677                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2678                                 }
2679                                 if (x <= endsub)
2680                                 {
2681                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2682                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2683                                         tci = _mm_madd_epi16(tci, tcoffset);
2684                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2685                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2686                                                                                         _mm_setzero_si128());
2687                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2688                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2689                                                                                         _mm_setzero_si128());
2690                                         fracm = _mm_srli_epi16(subtc, 1);
2691                                         pix1 = _mm_add_epi16(pix1,
2692                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2693                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2694                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2695                                         pix1 = _mm_add_epi16(pix1,
2696                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2697                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2698                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2699                                         x++;
2700                                 }
2701                         }
2702                 }
2703                 else
2704                 {
2705                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2706                         {
2707                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2708                                 {
2709                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2710                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2711                                         tci = _mm_madd_epi16(tci, tcoffset);
2712                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2713                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2714                                 }
2715                                 if (x <= endsub)
2716                                 {
2717                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2718                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2719                                         tci = _mm_madd_epi16(tci, tcoffset);
2720                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2721                                         x++;
2722                                 }
2723                         }
2724                         else
2725                         {
2726                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2727                                 {
2728                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2729                                         tci = _mm_and_si128(tci, tcmax); 
2730                                         tci = _mm_madd_epi16(tci, tcoffset);
2731                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2732                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2733                                 }
2734                                 if (x <= endsub)
2735                                 {
2736                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2737                                         tci = _mm_and_si128(tci, tcmax); 
2738                                         tci = _mm_madd_epi16(tci, tcoffset);
2739                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2740                                         x++;
2741                                 }
2742                         }
2743                 }
2744         }
2745 #endif
2746 }
2747
2748 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2749 {
2750         // TODO: IMPLEMENT
2751         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2752 }
2753
// Shadowmap lookup stub: the input vector is not consulted and the result is
// always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2759
2760 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2761 {
2762         int x;
2763         int startx = span->startx;
2764         int endx = span->endx;
2765         float c[4];
2766         float data[4];
2767         float slope[4];
2768         float z;
2769         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2770         for (x = startx;x < endx;x++)
2771         {
2772                 z = zf[x];
2773                 c[0] = (data[0] + slope[0]*x) * z;
2774                 c[1] = (data[1] + slope[1]*x) * z;
2775                 c[2] = (data[2] + slope[2]*x) * z;
2776                 c[3] = (data[3] + slope[3]*x) * z;
2777                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2778                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2779                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2780                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2781         }
2782 }
2783
2784 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2785 {
2786         int x;
2787         int startx = span->startx;
2788         int endx = span->endx;
2789         float c[4];
2790         float data[4];
2791         float slope[4];
2792         float z;
2793         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2794         for (x = startx;x < endx;x++)
2795         {
2796                 z = zf[x];
2797                 c[0] = (data[0] + slope[0]*x) * z;
2798                 c[1] = (data[1] + slope[1]*x) * z;
2799                 c[2] = (data[2] + slope[2]*x) * z;
2800                 c[3] = (data[3] + slope[3]*x) * z;
2801                 out4f[x*4+0] = c[0];
2802                 out4f[x*4+1] = c[1];
2803                 out4f[x*4+2] = c[2];
2804                 out4f[x*4+3] = c[3];
2805         }
2806 }
2807
2808 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2809 {
2810         int x, startx = span->startx, endx = span->endx;
2811         float c[4], localcolor[4];
2812         localcolor[0] = subcolor[0];
2813         localcolor[1] = subcolor[1];
2814         localcolor[2] = subcolor[2];
2815         localcolor[3] = subcolor[3];
2816         for (x = startx;x < endx;x++)
2817         {
2818                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2819                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2820                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2821                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2822                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2823                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2824                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2825                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2826         }
2827 }
2828
2829 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2830 {
2831         int x, startx = span->startx, endx = span->endx;
2832         for (x = startx;x < endx;x++)
2833         {
2834                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2835                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2836                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2837                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2838         }
2839 }
2840
2841 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2842 {
2843         int x, startx = span->startx, endx = span->endx;
2844         for (x = startx;x < endx;x++)
2845         {
2846                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2847                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2848                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2849                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2850         }
2851 }
2852
2853 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2854 {
2855         int x, startx = span->startx, endx = span->endx;
2856         float a, b;
2857         for (x = startx;x < endx;x++)
2858         {
2859                 a = 1.0f - inb4f[x*4+3];
2860                 b = inb4f[x*4+3];
2861                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2862                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2863                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2864                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2865         }
2866 }
2867
2868 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2869 {
2870         int x, startx = span->startx, endx = span->endx;
2871         float localcolor[4], ilerp, lerp;
2872         localcolor[0] = color[0];
2873         localcolor[1] = color[1];
2874         localcolor[2] = color[2];
2875         localcolor[3] = color[3];
2876         ilerp = 1.0f - localcolor[3];
2877         lerp = localcolor[3];
2878         for (x = startx;x < endx;x++)
2879         {
2880                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2881                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2882                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2883                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2884         }
2885 }
2886
2887
2888
2889 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2890 {
2891 #ifdef SSE2_PRESENT
2892         int x;
2893         int startx = span->startx;
2894         int endx = span->endx;
2895         __m128 data, slope;
2896         __m128 mod, endmod;
2897         __m128i submod, substep, endsubmod;
2898         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2899         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2900         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2901         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2902         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2903         for (x = startx; x < endx;)
2904         {
2905                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2906                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2907                 if (nextsub >= endx)
2908                 {
2909                         nextsub = endsub = endx-1;
2910                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2911                 }
2912                 mod = endmod;
2913                 submod = endsubmod;
2914                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2915                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2916                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2917                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2918                 substep = _mm_packs_epi32(substep, substep);
2919                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2920                 {
2921                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2922                         pix = _mm_mulhi_epu16(pix, submod);
2923                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2924                 }
2925                 if (x <= endsub)
2926                 {
2927                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2928                         pix = _mm_mulhi_epu16(pix, submod);
2929                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2930                         x++;
2931                 }
2932         }
2933 #endif
2934 }
2935
2936 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2937 {
2938 #ifdef SSE2_PRESENT
2939         int x;
2940         int startx = span->startx;
2941         int endx = span->endx;
2942         __m128 data, slope;
2943         __m128 mod, endmod;
2944         __m128i submod, substep, endsubmod;
2945         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2946         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2947         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2948         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2949         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2950         for (x = startx; x < endx;)
2951         {
2952                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2953                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2954                 if (nextsub >= endx)
2955                 {
2956                         nextsub = endsub = endx-1;
2957                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2958                 }
2959                 mod = endmod;
2960                 submod = endsubmod;
2961                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2962                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2963                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2964                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2965                 substep = _mm_packs_epi32(substep, substep);
2966                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2967                 {
2968                         __m128i pix = _mm_srai_epi16(submod, 4);
2969                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2970                 }
2971                 if (x <= endsub)
2972                 {
2973                         __m128i pix = _mm_srai_epi16(submod, 4);
2974                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2975                         x++;
2976                 }
2977         }
2978 #endif
2979 }
2980
2981 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2982 {
2983 #ifdef SSE2_PRESENT
2984         int x, startx = span->startx, endx = span->endx;
2985         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2986         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2987         for (x = startx;x+2 <= endx;x+=2)
2988         {
2989                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2990                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2991                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2992                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2993         }
2994         if (x < endx)
2995         {
2996                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2997                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2998                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2999                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3000         }
3001 #endif
3002 }
3003
3004 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3005 {
3006 #ifdef SSE2_PRESENT
3007         int x, startx = span->startx, endx = span->endx;
3008         for (x = startx;x+2 <= endx;x+=2)
3009         {
3010                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3011                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3012                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3013                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3014         }
3015         if (x < endx)
3016         {
3017                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3018                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3019                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3020                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3021         }
3022 #endif
3023 }
3024
3025 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3026 {
3027 #ifdef SSE2_PRESENT
3028         int x, startx = span->startx, endx = span->endx;
3029         for (x = startx;x+2 <= endx;x+=2)
3030         {
3031                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3032                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3033                 pix1 = _mm_add_epi16(pix1, pix2);
3034                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3035         }
3036         if (x < endx)
3037         {
3038                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3039                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3040                 pix1 = _mm_add_epi16(pix1, pix2);
3041                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3042         }
3043 #endif
3044 }
3045
3046 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3047 {
3048 #ifdef SSE2_PRESENT
3049         int x, startx = span->startx, endx = span->endx;
3050         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3051         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3052         for (x = startx;x+2 <= endx;x+=2)
3053         {
3054                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3055                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3056                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3057                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3058         }
3059         if (x < endx)
3060         {
3061                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3062                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3063                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3064                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3065         }
3066 #endif
3067 }
3068
3069 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3070 {
3071 #ifdef SSE2_PRESENT
3072         int x, startx = span->startx, endx = span->endx;
3073         for (x = startx;x+2 <= endx;x+=2)
3074         {
3075                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3076                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3077                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3078                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3079                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3080         }
3081         if (x < endx)
3082         {
3083                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3084                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3085                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3086                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3087                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3088         }
3089 #endif
3090 }
3091
3092 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3093 {
3094 #ifdef SSE2_PRESENT
3095         int x, startx = span->startx, endx = span->endx;
3096         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3097         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3098         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3099         for (x = startx;x+2 <= endx;x+=2)
3100         {
3101                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3102                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3103                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3104         }
3105         if (x < endx)
3106         {
3107                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3108                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3109                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3110         }
3111 #endif
3112 }
3113
3114
3115
3116 void DPSOFTRAST_VertexShader_Generic(void)
3117 {
3118         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3119         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3120         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3121         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3122                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3123 }
3124
3125 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3126 {
3127         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3128         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3129         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3130         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3132         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3133         {
3134                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3135                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3136                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3137                 {
3138                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3139                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3140                         {
3141                                 // multiply
3142                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3143                         }
3144                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3145                         {
3146                                 // add
3147                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3148                         }
3149                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3150                         {
3151                                 // alphablend
3152                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3153                         }
3154                 }
3155         }
3156         else
3157                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3158         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3159 }
3160
3161
3162
3163 void DPSOFTRAST_VertexShader_PostProcess(void)
3164 {
3165         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3166         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3167         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3168 }
3169
3170 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3171 {
3172         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3173         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3174         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3175         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3176         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3177         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3178         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3179         {
3180                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3181                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3182         }
3183         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3184         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3185         {
3186                 // TODO: implement saturation
3187         }
3188         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3189         {
3190                 // TODO: implement gammaramps
3191         }
3192         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3193 }
3194
3195
3196
3197 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3198 {
3199         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3200 }
3201
3202 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3203 {
3204         // this is never called (because colormask is off when this shader is used)
3205         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3206         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3207         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3208         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3209         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3210 }
3211
3212
3213
3214 void DPSOFTRAST_VertexShader_FlatColor(void)
3215 {
3216         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3217         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3218 }
3219
3220 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3221 {
3222 #ifdef SSE2_PRESENT
3223         unsigned char * RESTRICT pixelmask = span->pixelmask;
3224         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3225         int x, startx = span->startx, endx = span->endx;
3226         __m128i Color_Ambientm;
3227         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3228         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3230         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3231         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3232         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3233                 pixel = buffer_FragColorbgra8;
3234         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3235         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3236         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3237         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3238         for (x = startx;x < endx;x++)
3239         {
3240                 __m128i color, pix;
3241                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3242                 {
3243                         __m128i pix2;
3244                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3245                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3246                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3247                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3248                         x += 3;
3249                         continue;
3250                 }
3251                 if (!pixelmask[x])
3252                         continue;
3253                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3254                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3255                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3256         }
3257         if (pixel == buffer_FragColorbgra8)
3258                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3259 #endif
3260 }
3261
3262
3263
3264 void DPSOFTRAST_VertexShader_VertexColor(void)
3265 {
3266         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3267         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3268         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3269 }
3270
3271 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3272 {
3273 #ifdef SSE2_PRESENT
3274         unsigned char * RESTRICT pixelmask = span->pixelmask;
3275         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3276         int x, startx = span->startx, endx = span->endx;
3277         __m128i Color_Ambientm, Color_Diffusem;
3278         __m128 data, slope;
3279         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3280         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3281         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3282         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3283         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3284         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3285         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3286                 pixel = buffer_FragColorbgra8;
3287         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3288         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3289         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3290         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3291         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3292         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3293         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3294         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3295         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3296         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3297         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3298         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3299         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3300         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3301         {
3302                 __m128i color, mod, pix;
3303                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3304                 {
3305                         __m128i pix2, mod2;
3306                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3307                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3308                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3309                         data = _mm_add_ps(data, slope);
3310                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3311                         data = _mm_add_ps(data, slope);
3312                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3313                         data = _mm_add_ps(data, slope);
3314                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3315                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3316                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3317                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3318                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3319                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3320                         x += 3;
3321                         continue;
3322                 }
3323                 if (!pixelmask[x])
3324                         continue;
3325                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3326                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3327                 mod = _mm_packs_epi32(mod, mod);
3328                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3329                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3330         }
3331         if (pixel == buffer_FragColorbgra8)
3332                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3333 #endif
3334 }
3335
3336
3337
3338 void DPSOFTRAST_VertexShader_Lightmap(void)
3339 {
3340         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3341         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3342         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3343 }
3344
3345 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3346 {
3347 #ifdef SSE2_PRESENT
3348         unsigned char * RESTRICT pixelmask = span->pixelmask;
3349         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3350         int x, startx = span->startx, endx = span->endx;
3351         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3352         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3353         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3354         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3355         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3358         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3359         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3360         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3361                 pixel = buffer_FragColorbgra8;
3362         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3363         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3364         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3365         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3366         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3367         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3368         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3369         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3370         {
3371                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3372                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3373                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3375                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3376                 for (x = startx;x < endx;x++)
3377                 {
3378                         __m128i color, lightmap, glow, pix;
3379                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3380                         {
3381                                 __m128i pix2;
3382                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3383                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3384                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3385                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3386                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3387                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3388                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3389                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3390                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3391                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3392                                 x += 3;
3393                                 continue;
3394                         }
3395                         if (!pixelmask[x])
3396                                 continue;
3397                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3398                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3399                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3400                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3401                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3402                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3403                 }
3404         }
3405         else
3406         {
3407                 for (x = startx;x < endx;x++)
3408                 {
3409                         __m128i color, lightmap, pix;
3410                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3411                         {
3412                                 __m128i pix2;
3413                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3414                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3415                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3416                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3417                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3418                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3419                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3420                                 x += 3;
3421                                 continue;
3422                         }
3423                         if (!pixelmask[x]) 
3424                                 continue;
3425                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3426                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3427                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3428                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3429                 }
3430         }
3431         if (pixel == buffer_FragColorbgra8)
3432                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3433 #endif
3434 }
3435
3436
3437
3438 void DPSOFTRAST_VertexShader_FakeLight(void)
3439 {
3440         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3441 }
3442
3443 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3444 {
3445         // TODO: IMPLEMENT
3446         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3447         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3448         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3449         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3450         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3451 }
3452
3453
3454
// Vertex stage of the model-space light direction map mode.
// It has no processing of its own: it runs the plain lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3459
3460 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3461 {
3462         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3463         // TODO: IMPLEMENT
3464 }
3465
3466
3467
// Vertex stage of the tangent-space light direction map mode.
// It has no processing of its own: it runs the plain lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3472
3473 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3474 {
3475         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3476         // TODO: IMPLEMENT
3477 }
3478
3479
3480
3481 void DPSOFTRAST_VertexShader_LightDirection(void)
3482 {
3483         int i;
3484         int numvertices = dpsoftrast.numvertices;
3485         float LightDir[4];
3486         float LightVector[4];
3487         float EyePosition[4];
3488         float EyeVectorModelSpace[4];
3489         float EyeVector[4];
3490         float position[4];
3491         float svector[4];
3492         float tvector[4];
3493         float normal[4];
3494         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3495         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3496         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3497         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3498         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3499         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3500         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3501         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3502         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3503         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3504         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3505         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3506         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3507         for (i = 0;i < numvertices;i++)
3508         {
3509                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3510                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3511                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3512                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3513                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3514                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3515                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3516                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3517                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3518                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3519                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3520                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3521                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3522                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3523                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3524                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3525                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3526                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3527                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3528                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3529                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3530                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3531                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3532                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3533                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3534                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3535                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3536                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3537                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3538         }
3539         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3540 }
3541
3542 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3543 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3544 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3545 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3546 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3547 #define DPSOFTRAST_Vector3Normalize(v)\
3548 do\
3549 {\
3550         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3551         if (len)\
3552         {\
3553                 len = 1.0f / len;\
3554                 v[0] *= len;\
3555                 v[1] *= len;\
3556                 v[2] *= len;\
3557         }\
3558 }\
3559 while(0)
3560
3561 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3562 {
3563         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3564         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3565         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3566         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3567         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3568         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3569         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3570         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571         int x, startx = span->startx, endx = span->endx;
3572         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3573         float LightVectordata[4];
3574         float LightVectorslope[4];
3575         float EyeVectordata[4];
3576         float EyeVectorslope[4];
3577         float z;
3578         float diffusetex[4];
3579         float glosstex[4];
3580         float surfacenormal[4];
3581         float lightnormal[4];
3582         float eyenormal[4];
3583         float specularnormal[4];
3584         float diffuse;
3585         float specular;
3586         float SpecularPower;
3587         int d[4];
3588         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3589         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3590         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3591         Color_Glow[3] = 0.0f;
3592         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3593         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3594         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3595         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3596         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3597         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3598         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3599         Color_Pants[3] = 0.0f;
3600         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3601         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3602         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3603         Color_Shirt[3] = 0.0f;
3604         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3605         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3606         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3607         {
3608                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3609                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3610         }
3611         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3612         {
3613                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3614         }
3615         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3616         {
3617                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3618                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3619                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3620                 Color_Diffuse[3] = 0.0f;
3621                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3622                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3623                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3624                 LightColor[3] = 0.0f;
3625                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3626                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3628                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3629                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3630                 Color_Specular[3] = 0.0f;
3631                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3632                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3633                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3634                 for (x = startx;x < endx;x++)
3635                 {
3636                         z = buffer_z[x];
3637                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3638                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3639                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3640                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3641                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3642                         {
3643                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3644                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3645                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3646                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3647                         }
3648                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3649                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3650                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3651                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3652                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3653                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3654                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3655                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3656
3657                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3658                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3659                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3660                         DPSOFTRAST_Vector3Normalize(lightnormal);
3661
3662                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3663                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3664                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3665                         DPSOFTRAST_Vector3Normalize(eyenormal);
3666
3667                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3668                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3669                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3670                         DPSOFTRAST_Vector3Normalize(specularnormal);
3671
3672                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3673                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3674                         specular = pow(specular, SpecularPower * glosstex[3]);
3675                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3676                         {
3677                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3678                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3679                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3680                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3681                         }
3682                         else
3683                         {
3684                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3685                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3686                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3687                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3688                         }
3689                         buffer_FragColorbgra8[x*4+0] = d[0];
3690                         buffer_FragColorbgra8[x*4+1] = d[1];
3691                         buffer_FragColorbgra8[x*4+2] = d[2];
3692                         buffer_FragColorbgra8[x*4+3] = d[3];
3693                 }
3694         }
3695         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3696         {
3697                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3698                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3699                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3700                 Color_Diffuse[3] = 0.0f;
3701                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3702                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3703                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3704                 LightColor[3] = 0.0f;
3705                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3706                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3707                 for (x = startx;x < endx;x++)
3708                 {
3709                         z = buffer_z[x];
3710                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3711                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3712                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3713                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3714                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3715                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3716                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3717                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3718
3719                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3720                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3721                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3722                         DPSOFTRAST_Vector3Normalize(lightnormal);
3723
3724                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3725                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3726                         {
3727                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3728                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3729                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3730                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3731                         }
3732                         else
3733                         {
3734                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3735                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3736                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3737                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3738                         }
3739                         buffer_FragColorbgra8[x*4+0] = d[0];
3740                         buffer_FragColorbgra8[x*4+1] = d[1];
3741                         buffer_FragColorbgra8[x*4+2] = d[2];
3742                         buffer_FragColorbgra8[x*4+3] = d[3];
3743                 }
3744         }
3745         else
3746         {
3747                 for (x = startx;x < endx;x++)
3748                 {
3749                         z = buffer_z[x];
3750                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3751                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3752                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3753                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3754
3755                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3756                         {
3757                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3758                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3759                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3760                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3761                         }
3762                         else
3763                         {
3764                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3765                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3766                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3767                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3768                         }
3769                         buffer_FragColorbgra8[x*4+0] = d[0];
3770                         buffer_FragColorbgra8[x*4+1] = d[1];
3771                         buffer_FragColorbgra8[x*4+2] = d[2];
3772                         buffer_FragColorbgra8[x*4+3] = d[3];
3773                 }
3774         }
3775         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3776 }
3777
3778
3779
3780 void DPSOFTRAST_VertexShader_LightSource(void)
3781 {
3782         int i;
3783         int numvertices = dpsoftrast.numvertices;
3784         float LightPosition[4];
3785         float LightVector[4];
3786         float LightVectorModelSpace[4];
3787         float EyePosition[4];
3788         float EyeVectorModelSpace[4];
3789         float EyeVector[4];
3790         float position[4];
3791         float svector[4];
3792         float tvector[4];
3793         float normal[4];
3794         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3795         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3796         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3797         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3798         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3799         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3800         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3801         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3802         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3803         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3804         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3805         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3806         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3807         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3808         for (i = 0;i < numvertices;i++)
3809         {
3810                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3811                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3812                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3813                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3814                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3815                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3816                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3817                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3818                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3819                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3820                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3821                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3822                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3823                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3824                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3825                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3826                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3827                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3828                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3829                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3830                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3831                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3832                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3833                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3834                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3835                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3836                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3837                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3838                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3839                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3840                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3841                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3842         }
3843         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3844         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3845 }
3846
3847 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3848 {
3849 #ifdef SSE2_PRESENT
3850         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3851         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3852         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3853         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3854         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3855         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3856         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3857         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3858         int x, startx = span->startx, endx = span->endx;
3859         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3860         float CubeVectordata[4];
3861         float CubeVectorslope[4];
3862         float LightVectordata[4];
3863         float LightVectorslope[4];
3864         float EyeVectordata[4];
3865         float EyeVectorslope[4];
3866         float z;
3867         float diffusetex[4];
3868         float glosstex[4];
3869         float surfacenormal[4];
3870         float lightnormal[4];
3871         float eyenormal[4];
3872         float specularnormal[4];
3873         float diffuse;
3874         float specular;
3875         float SpecularPower;
3876         float CubeVector[4];
3877         float attenuation;
3878         int d[4];
3879         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3880         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3881         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3882         Color_Glow[3] = 0.0f;
3883         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3884         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3885         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3886         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3887         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3888         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3889         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3890         Color_Diffuse[3] = 0.0f;
3891         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3892         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3893         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3894         Color_Specular[3] = 0.0f;
3895         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3896         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3897         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3898         Color_Pants[3] = 0.0f;
3899         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3900         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3901         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3902         Color_Shirt[3] = 0.0f;
3903         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3904         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3905         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3906         LightColor[3] = 0.0f;
3907         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3908         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3909         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3910         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3911         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3912         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3913         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3914         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3915         {
3916                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3917                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3918         }
3919         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3920                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3921         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3922         {
3923                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3924                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3925                 for (x = startx;x < endx;x++)
3926                 {
3927                         z = buffer_z[x];
3928                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3929                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3930                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3931                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3932                         if (attenuation < 0.01f)
3933                                 continue;
3934                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3935                         {
3936                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3937                                 if (attenuation < 0.01f)
3938                                         continue;
3939                         }
3940
3941                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3942                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3943                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3944                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3945                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3946                         {
3947                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3948                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3949                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3950                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3951                         }
3952                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3953                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3954                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3955                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3956                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3957                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3958                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3959                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3960
3961                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3962                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3963                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3964                         DPSOFTRAST_Vector3Normalize(lightnormal);
3965
3966                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3967                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3968                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3969                         DPSOFTRAST_Vector3Normalize(eyenormal);
3970
3971                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3972                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3973                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3974                         DPSOFTRAST_Vector3Normalize(specularnormal);
3975
3976                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3977                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3978                         specular = pow(specular, SpecularPower * glosstex[3]);
3979                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3980                         {
3981                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3982                                 attenuation *= (1.0f / 255.0f);
3983                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3984                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3985                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3986                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3987                         }
3988                         else
3989                         {
3990                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3991                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3992                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3993                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3994                         }
3995                         buffer_FragColorbgra8[x*4+0] = d[0];
3996                         buffer_FragColorbgra8[x*4+1] = d[1];
3997                         buffer_FragColorbgra8[x*4+2] = d[2];
3998                         buffer_FragColorbgra8[x*4+3] = d[3];
3999                 }
4000         }
4001         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4002         {
4003                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4004                 for (x = startx;x < endx;x++)
4005                 {
4006                         z = buffer_z[x];
4007                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4008                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4009                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4010                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4011                         if (attenuation < 0.01f)
4012                                 continue;
4013                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4014                         {
4015                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4016                                 if (attenuation < 0.01f)
4017                                         continue;
4018                         }
4019
4020                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4021                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4022                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4023                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4024                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4025                         {
4026                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4027                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4028                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4029                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4030                         }
4031                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4032                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4033                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4034                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4035
4036                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4037                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4038                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4039                         DPSOFTRAST_Vector3Normalize(lightnormal);
4040
4041                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4042                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4043                         {
4044                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4045                                 attenuation *= (1.0f / 255.0f);
4046                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4047                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4048                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4049                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4050                         }
4051                         else
4052                         {
4053                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4054                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4055                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4056                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4057                         }
4058                         buffer_FragColorbgra8[x*4+0] = d[0];
4059                         buffer_FragColorbgra8[x*4+1] = d[1];
4060                         buffer_FragColorbgra8[x*4+2] = d[2];
4061                         buffer_FragColorbgra8[x*4+3] = d[3];
4062                 }
4063         }
4064         else
4065         {
4066                 for (x = startx;x < endx;x++)
4067                 {
4068                         z = buffer_z[x];
4069                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4070                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4071                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4072                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4073                         if (attenuation < 0.01f)
4074                                 continue;
4075                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4076                         {
4077                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4078                                 if (attenuation < 0.01f)
4079                                         continue;
4080                         }
4081
4082                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4083                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4084                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4085                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4086                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4087                         {
4088                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4089                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4090                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4091                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4092                         }
4093                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4094                         {
4095                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4096                                 attenuation *= (1.0f / 255.0f);
4097                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4098                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4099                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4100                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4101                         }
4102                         else
4103                         {
4104                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4105                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4106                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4107                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4108                         }
4109                         buffer_FragColorbgra8[x*4+0] = d[0];
4110                         buffer_FragColorbgra8[x*4+1] = d[1];
4111                         buffer_FragColorbgra8[x*4+2] = d[2];
4112                         buffer_FragColorbgra8[x*4+3] = d[3];
4113                 }
4114         }
4115         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4116 #endif
4117 }
4118
4119
4120
4121 void DPSOFTRAST_VertexShader_Refraction(void)
4122 {
4123         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4124 }
4125
4126 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4127 {
4128         // TODO: IMPLEMENT
4129         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4130         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4131         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4132         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4133         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4134 }
4135
4136
4137
4138 void DPSOFTRAST_VertexShader_Water(void)
4139 {
4140         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4141 }
4142
4143
4144 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4145 {
4146         // TODO: IMPLEMENT
4147         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4148         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4150         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4151         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4152 }
4153
4154
4155
4156 void DPSOFTRAST_VertexShader_ShowDepth(void)
4157 {
4158         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4159 }
4160
4161 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4162 {
4163         // TODO: IMPLEMENT
4164         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4165         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4166         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4167         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4168         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4169 }
4170
4171
4172
4173 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4174 {
4175         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4176 }
4177
4178 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4179 {
4180         // TODO: IMPLEMENT
4181         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4182         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4183         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4184         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4185         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4186 }
4187
4188
4189
4190 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4191 {
4192         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4193 }
4194
4195 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4196 {
4197         // TODO: IMPLEMENT
4198         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4199         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4200         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4201         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4202         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4203 }
4204
4205
4206
4207 typedef struct DPSOFTRAST_ShaderModeInfo_s
4208 {
4209         int lodarrayindex;
4210         void (*Vertex)(void);
4211         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4212         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4213         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4214 }
4215 DPSOFTRAST_ShaderModeInfo;
4216
4217 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4218 {
4219         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4220         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4221         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4222         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4223         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4224         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4225         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4226         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4227         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4228         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4229         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4230         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4231         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4232         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4233         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4234         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4235 };
4236
// Processes every span queued in thread->spans and then empties the queue
// (thread->numspans is reset to 0).
// With depth testing enabled and a depth buffer present, each span is depth
// tested into a local pixelmask, trimmed to the surviving pixels, shaded by the
// Span function of the current shader mode (only if there is a color buffer
// and thread->fb_colormask is non-zero), and finally written to the depth
// buffer when thread->depthmask is set.  Otherwise only the shading step runs,
// with every pixel of the span marked visible.
4237 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4238 {
4239         int i;
4240         int x;
4241         int startx;
4242         int endx;
4243 //      unsigned int c;
4244 //      unsigned int *colorpixel;
4245         unsigned int *depthpixel;
4246         float w;
4247         float wslope;
4248         int depth;
4249         int depthslope;
4250         unsigned int d;
4251         DPSOFTRAST_State_Triangle *triangle;
4252         DPSOFTRAST_State_Span *span;
             // one visibility flag per pixel of the current span; the same buffer is reused for every span
4253         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4254         for (i = 0; i < thread->numspans; i++)
4255         {
4256                 span = &thread->spans[i];
4257                 triangle = &thread->triangles[span->triangle];
4258                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4259                 {
                             // evaluate the triangle's w plane at the span origin:
                             // w[0] is the per-pixel x slope, w[1] the y slope, w[2] the constant term
4260                         wslope = triangle->w[0];
4261                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
                             // convert to integer depth; polygonoffset[1] is a constant bias and
                             // polygonoffset[0] is scaled by the magnitude of the x slope
4262                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4263                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
                             // depthpixel addresses the depth buffer at (span->x, span->y), so it is
                             // indexed with span-relative x values below, just like pixelmask
4264                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4265                         startx = span->startx;
4266                         endx = span->endx;
                             // fill pixelmask[startx..endx) by comparing the stored depth with the
                             // interpolated depth d; an unrecognized depth function acts like GL_ALWAYS
4267                         switch(thread->fb_depthfunc)
4268                         {
4269                         default:
4270                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4271                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4272                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4273                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4274                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4275                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4276                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4277                         }
4278                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4279                         //for (x = startx;x < endx;x++)
4280                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4281                         // trim pixels that failed the depth test from both ends of the span
4282                         while (startx < endx && !pixelmask[startx])
4283                                 startx++;
4284                         while (endx > startx && !pixelmask[endx-1])
4285                                 endx--;
4286                         if (startx >= endx)
4287                                 continue; // no pixels to fill
                             // publish the mask and the trimmed range to the span before shading
4288                         span->pixelmask = pixelmask;
4289                         span->startx = startx;
4290                         span->endx = endx;
4291                         // run pixel shader if appropriate
4292                         // do this before running depthmask code, to allow the pixelshader
4293                         // to clear pixelmask values for alpha testing
4294                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4295                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
                             // write depth only for pixels still set in pixelmask after shading
4296                         if (thread->depthmask)
4297                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4298                                         if (pixelmask[x])
4299                                                 depthpixel[x] = d;
4300                 }
4301                 else
4302                 {
4303                         // no depth testing means we're just dealing with color...
4304                         // if there is no color buffer, skip pixel shader
4305                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4306                         {
                                     // every pixel in [span->startx, span->endx) is visible
4307                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4308                                 span->pixelmask = pixelmask;
4309                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4310                         }
4311                 }
4312         }
             // the span queue is now empty; the triangle count is left for the caller to reset
4313         thread->numspans = 0;
4314 }
4315
// Declares the Draw command through DEFCOMMAND (macro defined outside this
// chunk).  DPSOFTRAST_Interpret_Draw below receives it as a
// DPSOFTRAST_Command_Draw and reads starty/endy, refcount, clipped,
// firstvertex, numvertices, numtriangles, arrays and the two element arrays.
// NOTE(review): that function also reads command->commandsize, which is not in
// this field list, so DEFCOMMAND presumably adds it; 22 is presumably the
// command's opcode -- confirm both against the DEFCOMMAND definition.
4316 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4317
// Executes one Draw command on the calling thread.  The body is compiled only
// when SSE2_PRESENT is defined; otherwise the function is empty.
// For each of command->numtriangles triangles it:
//  - fetches three vertex indices (element3s, else element3i, else sequential),
//  - clips against the near plane when command->clipped is set,
//  - culls by facing according to thread->cullface,
//  - clamps the integer screen bounds to the scissor rectangle and to the
//    thread's two y ranges (miny1..maxy1 and miny2..maxy2),
//  - stores the w plane, the attribute planes and the mip levels in
//    thread->triangles[thread->numtriangles],
//  - appends spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels to
//    thread->spans[], flushing through DPSOFTRAST_Draw_ProcessSpans whenever
//    the span or triangle queue fills up.
// command->refcount is atomically decremented once on each exit path.
4318 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4319 {
4320 #ifdef SSE2_PRESENT
4321         int cullface = thread->cullface;
4322         int minx, maxx, miny, maxy;
4323         int miny1, maxy1, miny2, maxy2;
4324         __m128i fbmin, fbmax;
4325         __m128 viewportcenter, viewportscale;
4326         int firstvertex = command->firstvertex;
4327         int numvertices = command->numvertices;
4328         int numtriangles = command->numtriangles;
4329         const int *element3i = command->element3i;
4330         const unsigned short *element3s = command->element3s;
4331         int clipped = command->clipped;
4332         int i;
4333         int j;
4334         int k;
4335         int y;
4336         int e[3];
4337         __m128i screeny;
4338         int starty, endy, bandy;
4339         int numpoints;
4340         int clipcase;
4341         float clipdist[4];
4342         __m128 triangleedge1, triangleedge2, trianglenormal;
4343         __m128 clipfrac[3];
4344         __m128 screen[4];
4345         DPSOFTRAST_State_Triangle *triangle;
4346         DPSOFTRAST_Texture *texture;
4347         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // restrict the thread's two y ranges to the scissor rectangle
             // (bound() is defined outside this chunk; presumably it clamps its middle argument)
4348         miny = thread->fb_scissor[1];
4349         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4350         miny1 = bound(miny, thread->miny1, maxy);
4351         maxy1 = bound(miny, thread->maxy1, maxy);
4352         miny2 = bound(miny, thread->miny2, maxy);
4353         maxy2 = bound(miny, thread->maxy2, maxy);
             // the command's rows [starty, endy) miss both ranges: nothing to draw here,
             // but this thread's reference must still be dropped
4354         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4355         {
4356                 if (!ATOMIC_DECREMENT(command->refcount))
4357                 {
                             // NOTE(review): arrays is freed only when the command is no larger than the
                             // bare struct; presumably a larger command carries its arrays inline -- confirm
                             // against DPSOFTRAST_Draw_AllocateDrawCommand
4358                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4359                                 MM_FREE(command->arrays);
4360                 }
4361                 return;
4362         }
4363         minx = thread->fb_scissor[0];
4364         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // clamp limits replicated as (x, y) pairs of 16-bit lanes, matching the packed
             // screen coordinates built per triangle below; fbmax is inclusive, hence the -1
4365         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4366         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4367         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4368         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4369         screen[3] = _mm_setzero_ps();
4370         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4371         for (i = 0;i < numtriangles;i++)
4372         {
                     // command->arrays starts with numvertices float[4] screen coordinates; 'arrays'
                     // begins at the float[4] array that follows, whose z and w components are used
                     // for the near-plane distances below
4373                 const float *screencoord4f = command->arrays;
4374                 const float *arrays = screencoord4f + numvertices*4;
4375
4376                 // generate the 3 edges of this triangle
4377                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // element indices are rebased by firstvertex; without an element array the
                     // vertices are taken sequentially
4378                 if (element3s)
4379                 {
4380                         e[0] = element3s[i*3+0] - firstvertex;
4381                         e[1] = element3s[i*3+1] - firstvertex;
4382                         e[2] = element3s[i*3+2] - firstvertex;
4383                 }
4384                 else if (element3i)
4385                 {
4386                         e[0] = element3i[i*3+0] - firstvertex;
4387                         e[1] = element3i[i*3+1] - firstvertex;
4388                         e[2] = element3i[i*3+2] - firstvertex;
4389                 }
4390                 else
4391                 {
4392                         e[0] = i*3+0;
4393                         e[1] = i*3+1;
4394                         e[2] = i*3+2;
4395                 }
4396
                     // SKIPBACKFACE: computes the two edge vectors leaving screen[1] and, in element 0
                     // of trianglenormal, edge1.x*edge2.y - edge1.y*edge2.x.  It 'continue's to the
                     // next triangle when cullface is GL_BACK and that value is negative, or when
                     // cullface is GL_FRONT and it is positive.  The edges and the normal are reused
                     // by the plane setup further down.
4397 #define SKIPBACKFACE \
4398                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4399                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4400                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4401                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4402                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4403                 switch(cullface) \
4404                 { \
4405                 case GL_BACK: \
4406                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4407                                 continue; \
4408                         break; \
4409                 case GL_FRONT: \
4410                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4411                                 continue; \
4412                         break; \
4413                 }
4414
                     // CLIPPEDVERTEXLERP(k,p1,p2): stores the crossing fraction of edge p1->p2 in
                     // clipfrac[p1], interpolates the two vertices from 'arrays' by that fraction and
                     // projects the result into screen[k] with DPSOFTRAST_PROJECTVERTEX.
                     // CLIPPEDVERTEXCOPY(k,p1): loads vertex e[p1] from screencoord4f into screen[k].
4415 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4416                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4417                         { \
4418                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4419                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4420                         }
4421 #define CLIPPEDVERTEXCOPY(k,p1) \
4422                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4423
                     // GENATTRIBCOPY / GENATTRIBLERP fetch one attribute value from the array 'arrays'
                     // currently points at, either directly or interpolated with the clipfrac[p1]
                     // stored by CLIPPEDVERTEXLERP.  GENATTRIBS picks, per clipcase, the copy/lerp
                     // combination for the first three polygon points.
4424 #define GENATTRIBCOPY(attrib, p1) \
4425                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4426 #define GENATTRIBLERP(attrib, p1, p2) \
4427                 { \
4428                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4429                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4430                 }
4431 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4432                 switch(clipcase) \
4433                 { \
4434                 default: \
4435                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4436                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4437                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4438                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4439                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4440                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4441                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4442                 }
4443
                     // commands not flagged as clipped jump straight into the all-in-front case
                     // below, skipping the distance computation
4444                 if (! clipped)
4445                         goto notclipped;
4446
4447                 // calculate distance from nearplane
4448                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4449                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4450                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // classify by which vertices have a non-negative distance; clipcase records the
                     // combination for GENATTRIBS, and numpoints is 4 when clipping yields a quad
4451                 if (clipdist[0] >= 0.0f)
4452                 {
4453                         if (clipdist[1] >= 0.0f)
4454                         {
4455                                 if (clipdist[2] >= 0.0f)
4456                                 {
4457                                 notclipped:
4458                                         // triangle is entirely in front of nearplane
4459                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4460                                         SKIPBACKFACE;
4461                                         numpoints = 3;
4462                                         clipcase = 0;
4463                                 }
4464                                 else
4465                                 {
4466                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4467                                         SKIPBACKFACE;
4468                                         numpoints = 4;
4469                                         clipcase = 1;
4470                                 }
4471                         }
4472                         else
4473                         {
4474                                 if (clipdist[2] >= 0.0f)
4475                                 {
4476                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4477                                         SKIPBACKFACE;
4478                                         numpoints = 4;
4479                                         clipcase = 2;
4480                                 }
4481                                 else
4482                                 {
4483                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4484                                         SKIPBACKFACE;
4485                                         numpoints = 3;
4486                                         clipcase = 3;
4487                                 }
4488                         }
4489                 }
4490                 else if (clipdist[1] >= 0.0f)
4491                 {
4492                         if (clipdist[2] >= 0.0f)
4493                         {
4494                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4495                                 SKIPBACKFACE;
4496                                 numpoints = 4;
4497                                 clipcase = 4;
4498                         }
4499                         else
4500                         {
4501                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4502                                 SKIPBACKFACE;
4503                                 numpoints = 3;
4504                                 clipcase = 5;
4505                         }
4506                 }
4507                 else if (clipdist[2] >= 0.0f)
4508                 {
4509                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4510                         SKIPBACKFACE;
4511                         numpoints = 3;
4512                         clipcase = 6;
4513                 }
4514                 else continue; // triangle is entirely behind nearplane
4515
4516                 {
4517                         // calculate integer y coords for triangle points
                             // screeni packs the truncated (x, y) of each point as 16-bit lanes
                             // (x0,y0,x1,y1,x2,y2,x3,y3); with three points, point 2 fills the fourth slot.
                             // The min/max reductions leave the bounding box in the low lanes, which is
                             // then clamped to fbmin/fbmax.
4518                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4519                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4520                                         screenmin = _mm_min_epi16(screeni, screenir),
4521                                         screenmax = _mm_max_epi16(screeni, screenir);
4522                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4523                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4524                         screenmin = _mm_max_epi16(screenmin, fbmin);
4525                         screenmax = _mm_min_epi16(screenmax, fbmax);
4526                         // skip offscreen triangles
4527                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4528                                 continue;
                             // lane 1 holds the clamped y bounds; endy is exclusive
4529                         starty = _mm_extract_epi16(screenmin, 1);
4530                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // skip triangles lying entirely between the thread's two y ranges
4531                         if (starty >= maxy1 && endy <= miny2)
4532                                 continue;
                             // arithmetic shift leaves each point's y, sign-extended, in one 32-bit lane
4533                         screeny = _mm_srai_epi32(screeni, 16);
4534                 }
4535
                     // fill the next free triangle slot; it is committed (numtriangles incremented)
                     // only after its spans have been generated
4536                 triangle = &thread->triangles[thread->numtriangles];
4537
4538                 // calculate attribute plans for triangle data...
4539                 // okay, this triangle is going to produce spans, we'd better project
4540                 // the interpolants now (this is what gives perspective texturing),
4541                 // this consists of simply multiplying all arrays by the W coord
4542                 // (which is basically 1/Z), which will be undone per-pixel
4543                 // (multiplying by Z again) to get the perspective-correct array
4544                 // values
4545                 {
4546                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4547                         __m128 mipedgescale, mipdensity;
                             // the four edge components divided by the scalar from SKIPBACKFACE, each then
                             // broadcast to all lanes; they turn differences along the two edges into
                             // screen-space x and y slopes
4548                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4549                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4550                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4551                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4552                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4553                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4554                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4555                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself: x slope, y slope and origin go to triangle->w[0..2]
4556                         attribedge1 = _mm_sub_ss(w0, w1);
4557                         attribedge2 = _mm_sub_ss(w2, w1);
4558                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4559                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4560                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4561                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4562                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4563                         _mm_store_ss(&triangle->w[0], attribxslope);
4564                         _mm_store_ss(&triangle->w[1], attribyslope);
4565                         _mm_store_ss(&triangle->w[2], attriborigin);
4566                         mipedgescale = _mm_setzero_ps();
                             // visit the vertex arrays listed for the current shader mode, in table order;
                             // an entry >= DPSOFTRAST_ARRAY_TOTAL ends the list.  'arrays' advances by one
                             // float[4] array per listed entry.
4567                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4568                         {
4569                                 __m128 attrib0, attrib1, attrib2;
4570                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4571                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4572                                         break;
4573                                 arrays += numvertices*4;
4574                                 GENATTRIBS(attrib0, attrib1, attrib2);
                                     // same plane construction as for w, applied to attribute*w per component
4575                                 attriborigin = _mm_mul_ps(attrib1, w1);
4576                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4577                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4578                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4579                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4580                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4581                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4582                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4583                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array the shader mode names as lodarrayindex: change of its
                                     // first two components along each edge, divided by that edge's screen
                                     // length (approximate reciprocal square root); used for mip selection below
4584                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4585                                 {
4586                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4587                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4588                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4589                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4590                                 }
4591                         }
4592
                             // mip level defaults to 0 for every texture unit; a level is chosen only for
                             // bound textures whose filter is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4593                         memset(triangle->mip, 0, sizeof(triangle->mip));
4594                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4595                         {
4596                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4597                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4598                                         break;
4599                                 texture = thread->texbound[texunit];
4600                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4601                                 {
                                             // NOTE(review): mipmap[0][2] and [3] are presumably the base level's
                                             // width and height -- confirm against DPSOFTRAST_Texture.
                                             // The result is the smaller of the two edges' squared densities.
4602                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4603                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4604                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4605                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4606                                         // this will be multiplied in the texturing routine by the texture resolution
4607                                         y = _mm_cvtss_si32(mipdensity);
4608                                         if (y > 0)
4609                                         {
                                                     // half the base-2 logarithm of the squared density, capped at the last mip level
4610                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4611                                                 if (y > texture->mipmaps - 1)
4612                                                         y = texture->mipmaps - 1;
4613                                                 triangle->mip[texunit] = y;
4614                                         }
4615                                 }
4616                         }
4617                 }
4618         
                     // walk the rows in up to two bands: first up to maxy1, then from miny2 up to maxy2
4619                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4620                 for (; y < bandy;)
4621                 {
4622                         __m128 xcoords, xslope;
                             // each 32-bit lane of ycc is all ones when row y is past that point's y, so
                             // yccmask has one F nibble per passed point (point 0 in the lowest nibble);
                             // the bit strings in the case comments mark the points not yet passed
4623                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4624                         int yccmask = _mm_movemask_epi8(ycc);
4625                         int edge0p, edge0n, edge1p, edge1n;
4626                         int nexty;
                             // choose the two active edges (edgeNp -> edgeNn); all points passed ends the
                             // triangle, no point passed yet just advances to the next row
4627                         if (numpoints == 4)
4628                         {
4629                                 switch(yccmask)
4630                                 {
4631                                 default:
4632                                 case 0xFFFF: /*0000*/ y = endy; continue;
4633                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4634                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4635                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4636                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4637                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4638                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4639                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4640                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4641                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4642                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4643                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4644                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4645                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4646                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4647                                 case 0x0000: /*1111*/ y++; continue;
4648                                 }
4649                         }
4650                         else
4651                         {
4652                                 switch(yccmask)
4653                                 {
4654                                 default:
4655                                 case 0xFFFF: /*000*/ y = endy; continue;
4656                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4657                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4658                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4659                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4660                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4661                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4662                                 case 0x0000: /*111*/ y++; continue;
4663                                 }
4664                         }
                             // passed points become 0x7FFF and the rest keep their y, so the minimum over
                             // the lanes is the nearest point y not yet passed; the edge pair stays valid
                             // up to that row (limited to the current band)
4665                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4666                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4667                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4668                         nexty = _mm_extract_epi16(ycc, 0);
4669                         if (nexty >= bandy) nexty = bandy-1;
                             // lanes 0 and 2 carry edge 0 and edge 1: xslope is dx/dy of each edge and
                             // xcoords is each edge's x at row y, plus 0.5 ahead of the integer conversion
4670                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4671                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4672                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4673                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4674                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // swap the edges so that lane 0 is the smaller x at row y
4675                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4676                         {
4677                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4678                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4679                         }
4680                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4681                         {
4682                                 int startx, endx, offset;
4683                                 startx = _mm_cvtss_si32(xcoords);
4684                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // advance startx toward minx in whole multiples of the span length only
                                     // (the mask arithmetic presumes DPSOFTRAST_DRAW_MAXSPANLENGTH is a power
                                     // of two); any pixels still left of minx are dropped via span->startx below
4685                                 if (startx < minx) 
4686                                 {
4687                                         if (startx < 0) startx = 0;
4688                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4689                                 }
4690                                 if (endx > maxx) endx = maxx;
4691                                 if (startx >= endx) continue;
                                     // cut the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
                                     // span->startx/endx are relative to span->x
4692                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4693                                 {
4694                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4695                                         span->triangle = thread->numtriangles;
4696                                         span->x = offset;
4697                                         span->y = y;
4698                                         span->startx = max(minx - offset, 0);
4699                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4700                                         if (span->startx >= span->endx)
4701                                                 continue; 
                                             // flush when the span queue is full; the triangle being built keeps
                                             // its slot because ProcessSpans resets only numspans
4702                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4703                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4704                                 }
4705                         }
4706                 }
4707
                     // commit the triangle; when the triangle queue is full, process the queued
                     // spans and start again at slot 0
4708                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4709                 {
4710                         DPSOFTRAST_Draw_ProcessSpans(thread);
4711                         thread->numtriangles = 0;
4712                 }
4713         }
4714
             // drop this thread's reference; the free condition is the same as on the early-return path
4715         if (!ATOMIC_DECREMENT(command->refcount))
4716         {
4717                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4718                         MM_FREE(command->arrays);
4719         }
4720
             // process whatever is still queued so both queues are empty when the function returns
4721         if (thread->numspans > 0 || thread->numtriangles > 0)
4722         {
4723                 DPSOFTRAST_Draw_ProcessSpans(thread);
4724                 thread->numtriangles = 0;
4725         }
4726 #endif
4727 }
4728
4729 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4730 {
4731         int i;
4732         int j;
4733         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4734         int datasize = 2*numvertices*sizeof(float[4]);
4735         DPSOFTRAST_Command_Draw *command;
4736         unsigned char *data;
4737         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4738         {
4739                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4740                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4741                         break;
4742                 datasize += numvertices*sizeof(float[4]);
4743         }
4744         if (element3s)
4745                 datasize += numtriangles*sizeof(unsigned short[3]);
4746         else if (element3i)
4747                 datasize += numtriangles*sizeof(int[3]);
4748         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4749         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4750         {
4751                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4752                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4753         }
4754         else
4755         {
4756                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4757                 data = (unsigned char *)command + commandsize;
4758         }
4759         command->firstvertex = firstvertex;
4760         command->numvertices = numvertices;
4761         command->numtriangles = numtriangles;
4762         command->arrays = (float *)data;
4763         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4764         dpsoftrast.firstvertex = firstvertex;
4765         dpsoftrast.numvertices = numvertices;
4766         dpsoftrast.screencoord4f = (float *)data;
4767         data += numvertices*sizeof(float[4]);
4768         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4769         data += numvertices*sizeof(float[4]);
4770         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4771         {
4772                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4773                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4774                         break;
4775                 dpsoftrast.post_array4f[j] = (float *)data;
4776                 data += numvertices*sizeof(float[4]);
4777         }
4778         command->element3i = NULL;
4779         command->element3s = NULL;
4780         if (element3s)
4781         {
4782                 command->element3s = (unsigned short *)data;
4783                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4784         }
4785         else if (element3i)
4786         {
4787                 command->element3i = (int *)data;
4788                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4789         }
4790         return command;
4791 }
4792
4793 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4794 {
4795         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4796         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4797         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4798         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4799         if (command->starty >= command->endy)
4800         {
4801                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4802                         MM_FREE(command->arrays);
4803                 DPSOFTRAST_UndoCommand(command->commandsize);
4804                 return;
4805         }
4806         command->clipped = dpsoftrast.drawclipped;
4807         command->refcount = dpsoftrast.numthreads;
4808
4809         if (dpsoftrast.usethreads)
4810         {
4811                 int i;
4812                 DPSOFTRAST_Draw_SyncCommands();
4813                 for (i = 0; i < dpsoftrast.numthreads; i++)
4814                 {
4815                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4816                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4817                                 Thread_CondSignal(thread->drawcond);
4818                 }
4819         }
4820         else
4821         {
4822                 DPSOFTRAST_Draw_FlushThreads();
4823         }
4824 }
4825  
4826 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4827 {
4828         int commandoffset = thread->commandoffset;
4829         while (commandoffset != endoffset)
4830         {
4831                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4832                 switch (command->opcode)
4833                 {
4834 #define INTERPCOMMAND(name) \
4835                 case DPSOFTRAST_OPCODE_##name : \
4836                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4837                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4838                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4839                                 commandoffset = 0; \
4840                         break;
4841                 INTERPCOMMAND(Viewport)
4842                 INTERPCOMMAND(ClearColor)
4843                 INTERPCOMMAND(ClearDepth)
4844                 INTERPCOMMAND(ColorMask)
4845                 INTERPCOMMAND(DepthTest)
4846                 INTERPCOMMAND(ScissorTest)
4847                 INTERPCOMMAND(Scissor)
4848                 INTERPCOMMAND(BlendFunc)
4849                 INTERPCOMMAND(BlendSubtract)
4850                 INTERPCOMMAND(DepthMask)
4851                 INTERPCOMMAND(DepthFunc)
4852                 INTERPCOMMAND(DepthRange)
4853                 INTERPCOMMAND(PolygonOffset)
4854                 INTERPCOMMAND(CullFace)
4855                 INTERPCOMMAND(AlphaTest)
4856                 INTERPCOMMAND(AlphaFunc)
4857                 INTERPCOMMAND(SetTexture)
4858                 INTERPCOMMAND(SetShader)
4859                 INTERPCOMMAND(Uniform4f)
4860                 INTERPCOMMAND(UniformMatrix4f)
4861                 INTERPCOMMAND(Uniform1i)
4862
4863                 case DPSOFTRAST_OPCODE_Draw:
4864                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4865                         commandoffset += command->commandsize;
4866                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4867                                 commandoffset = 0;
4868                         thread->commandoffset = commandoffset;
4869                         break;
4870
4871                 case DPSOFTRAST_OPCODE_Reset:
4872                         commandoffset = 0;
4873                         break;
4874                 }
4875         }
4876         thread->commandoffset = commandoffset;
4877 }
4878
4879 static int DPSOFTRAST_Draw_Thread(void *data)
4880 {
4881         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4882         while(thread->index >= 0)
4883         {
4884                 if (thread->commandoffset != dpsoftrast.drawcommand)
4885                 {
4886                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
4887                 }
4888                 else 
4889                 {
4890                         Thread_LockMutex(thread->drawmutex);
4891                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
4892                         {
4893                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
4894                                 thread->starving = true;
4895                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
4896                                 thread->starving = false;
4897                         }
4898                         Thread_UnlockMutex(thread->drawmutex);
4899                 }
4900         }   
4901         return 0;
4902 }
4903
4904 static void DPSOFTRAST_Draw_FlushThreads(void)
4905 {
4906         DPSOFTRAST_State_Thread *thread;
4907         int i;
4908         DPSOFTRAST_Draw_SyncCommands();
4909         if (dpsoftrast.usethreads) 
4910         {
4911                 for (i = 0; i < dpsoftrast.numthreads; i++)
4912                 {
4913                         thread = &dpsoftrast.threads[i];
4914                         if (thread->commandoffset != dpsoftrast.drawcommand)
4915                         {
4916                                 Thread_LockMutex(thread->drawmutex);
4917                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4918                                         Thread_CondSignal(thread->drawcond);
4919                                 Thread_UnlockMutex(thread->drawmutex);
4920                         }
4921                 }
4922                 for (i = 0; i < dpsoftrast.numthreads; i++)
4923                 {
4924                         thread = &dpsoftrast.threads[i];
4925                         if (thread->commandoffset != dpsoftrast.drawcommand)
4926                         {
4927                                 Thread_LockMutex(thread->drawmutex);
4928                                 if (thread->commandoffset != dpsoftrast.drawcommand)
4929                                 {
4930                                         thread->waiting = true;
4931                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
4932                                         thread->waiting = false;
4933                                 }
4934                                 Thread_UnlockMutex(thread->drawmutex);
4935                         }
4936                 }
4937         }
4938         else
4939         {
4940                 for (i = 0; i < dpsoftrast.numthreads; i++)
4941                 {
4942                         thread = &dpsoftrast.threads[i];
4943                         if (thread->commandoffset != dpsoftrast.drawcommand)
4944                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4945                 }
4946         }
4947         dpsoftrast.commandpool.usedcommands = 0;
4948 }
4949
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads(), which
// runs all pending commands to completion before returning.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
4954
// Public finish entry point; in this implementation it does exactly what
// DPSOFTRAST_Flush() does.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
4959
4960 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4961 {
4962         int i;
4963         union
4964         {
4965                 int i;
4966                 unsigned char b[4];
4967         }
4968         u;
4969         u.i = 1;
4970         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4971         dpsoftrast.bigendian = u.b[3];
4972         dpsoftrast.fb_width = width;
4973         dpsoftrast.fb_height = height;
4974         dpsoftrast.fb_depthpixels = depthpixels;
4975         dpsoftrast.fb_colorpixels[0] = colorpixels;
4976         dpsoftrast.fb_colorpixels[1] = NULL;
4977         dpsoftrast.fb_colorpixels[1] = NULL;
4978         dpsoftrast.fb_colorpixels[1] = NULL;
4979         dpsoftrast.viewport[0] = 0;
4980         dpsoftrast.viewport[1] = 0;
4981         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4982         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4983         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4984         dpsoftrast.texture_firstfree = 1;
4985         dpsoftrast.texture_end = 1;
4986         dpsoftrast.texture_max = 0;
4987         dpsoftrast.color[0] = 1;
4988         dpsoftrast.color[1] = 1;
4989         dpsoftrast.color[2] = 1;
4990         dpsoftrast.color[3] = 1;
4991         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
4992         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
4993         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
4994         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4995         for (i = 0; i < dpsoftrast.numthreads; i++)
4996         {
4997                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4998                 thread->index = i;
4999                 thread->cullface = GL_BACK;
5000                 thread->colormask[1] = 1;
5001                 thread->colormask[2] = 1;
5002                 thread->colormask[3] = 1;
5003                 thread->blendfunc[0] = GL_ONE;
5004                 thread->blendfunc[1] = GL_ZERO;
5005                 thread->depthmask = true;
5006                 thread->depthtest = true;
5007                 thread->depthfunc = GL_LEQUAL;
5008                 thread->scissortest = false;
5009                 thread->alphatest = false;
5010                 thread->alphafunc = GL_GREATER;
5011                 thread->alphavalue = 0.5f;
5012                 thread->viewport[0] = 0;
5013                 thread->viewport[1] = 0;
5014                 thread->viewport[2] = dpsoftrast.fb_width;
5015                 thread->viewport[3] = dpsoftrast.fb_height;
5016                 thread->scissor[0] = 0;
5017                 thread->scissor[1] = 0;
5018                 thread->scissor[2] = dpsoftrast.fb_width;
5019                 thread->scissor[3] = dpsoftrast.fb_height;
5020                 thread->depthrange[0] = 0;
5021                 thread->depthrange[1] = 1;
5022                 thread->polygonoffset[0] = 0;
5023                 thread->polygonoffset[1] = 0;
5024         
5025                 if (dpsoftrast.interlace)
5026                 {
5027                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5028                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5029                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5030                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5031                 }
5032                 else
5033                 {
5034                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5035                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5036                 }
5037
5038                 thread->numspans = 0;
5039                 thread->numtriangles = 0;
5040                 thread->commandoffset = 0;
5041                 thread->waiting = false;
5042                 thread->starving = false;
5043            
5044                 thread->validate = -1;
5045                 DPSOFTRAST_Validate(thread, -1);
5046  
5047                 if (dpsoftrast.usethreads)
5048                 {
5049                         thread->waitcond = Thread_CreateCond();
5050                         thread->drawcond = Thread_CreateCond();
5051                         thread->drawmutex = Thread_CreateMutex();
5052                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5053                 }
5054         }
5055         return 0;
5056 }
5057
5058 void DPSOFTRAST_Shutdown(void)
5059 {
5060         int i;
5061         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5062         {
5063                 DPSOFTRAST_State_Thread *thread;
5064                 for (i = 0; i < dpsoftrast.numthreads; i++)
5065                 {
5066                         thread = &dpsoftrast.threads[i];
5067                         Thread_LockMutex(thread->drawmutex);
5068                         thread->index = -1;
5069                         Thread_CondSignal(thread->drawcond);
5070                         Thread_UnlockMutex(thread->drawmutex);
5071                         Thread_WaitThread(thread->thread, 0);
5072                         Thread_DestroyCond(thread->waitcond);
5073                         Thread_DestroyCond(thread->drawcond);
5074                         Thread_DestroyMutex(thread->drawmutex);
5075                 }
5076         }
5077         for (i = 0;i < dpsoftrast.texture_end;i++)
5078                 if (dpsoftrast.texture[i].bytes)
5079                         MM_FREE(dpsoftrast.texture[i].bytes);
5080         if (dpsoftrast.texture)
5081                 free(dpsoftrast.texture);
5082         if (dpsoftrast.threads)
5083                 MM_FREE(dpsoftrast.threads);
5084         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5085 }
5086