1e9722b58545dbc20d3e14a09ca4ec872775ecee
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// alignment, in bytes, applied by the ALIGN() and ATOMIC() wrappers below
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// compiler specific implementations, only selected when SSE2 is available;
// anything left undefined here gets a plain single-threaded fallback below
#ifdef SSE2_PRESENT
	#if defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(ALIGN_SIZE)))
		#define ATOMIC(var) var __attribute__((__aligned__(ATOMIC_SIZE)))
		#define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(ALIGN_SIZE)) var
		#define ATOMIC(var) __declspec(align(ATOMIC_SIZE)) var
		#define MEMORY_BARRIER (_mm_sfence())
		//(MemoryBarrier())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		// result is discarded so that ATOMIC_ADD is a void expression on every compiler
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// fallbacks: no alignment, no barrier, ordinary (non-atomic) arithmetic
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
// ATOMIC_INCREMENT and ATOMIC_DECREMENT evaluate to the updated counter value
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
59
#ifdef SSE2_PRESENT
#include <emmintrin.h>

// uninitialized allocation aligned to ATOMIC_SIZE bytes; release with MM_FREE
#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc() counterpart of MM_MALLOC: returns nmemb*size zeroed bytes aligned
// to ATOMIC_SIZE, or NULL if the allocation fails or the byte count does not
// fit in size_t; release with MM_FREE
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// refuse requests whose byte count would wrap around, like calloc() does
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
// without SSE2 the regular C allocator is used
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
78
// indices of the vertex attribute arrays; DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used as the dimension of tables indexed by this enum
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR,
	// one entry per texcoord array
	DPSOFTRAST_ARRAY_TEXCOORD0,
	DPSOFTRAST_ARRAY_TEXCOORD1,
	DPSOFTRAST_ARRAY_TEXCOORD2,
	DPSOFTRAST_ARRAY_TEXCOORD3,
	DPSOFTRAST_ARRAY_TEXCOORD4,
	DPSOFTRAST_ARRAY_TEXCOORD5,
	DPSOFTRAST_ARRAY_TEXCOORD6,
	DPSOFTRAST_ARRAY_TEXCOORD7,
	DPSOFTRAST_ARRAY_TOTAL
} DPSOFTRAST_ARRAY;
94
95 typedef struct DPSOFTRAST_Texture_s
96 {
97         int flags;
98         int width;
99         int height;
100         int depth;
101         int sides;
102         DPSOFTRAST_TEXTURE_FILTER filter;
103         int mipmaps;
104         int size;
105         ATOMIC_COUNTER binds;
106         unsigned char *bytes;
107         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
108 }
109 DPSOFTRAST_Texture;
110
// commands use the same alignment as ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// header shared by every command; the structs generated by DEFCOMMAND begin
// with these same two members
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;
	unsigned short commandsize;
} DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// declares the constant DPSOFTRAST_OPCODE_<name> (= opcodeval) and the aligned
// struct DPSOFTRAST_Command_<name>, made of the common header followed by fields
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
131
132 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
133 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
134
135 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
136 {
137         int freecommand;
138         int usedcommands;
139         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
140 }
141 DPSOFTRAST_State_Command_Pool);
142
143 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
144 {
145         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
146         float w[3];
147         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
148 }
149 DPSOFTRAST_State_Triangle);
150
// Evaluates one interpolated attribute of a triangle at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// SSE version: data and slope are __m128 lvalues.
// triangle and span are pointer expressions and are evaluated several times.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4] arrays
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
167                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels produced from a triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer x coord
	int x;
	// framebuffer y coord
	int y;
	// usable range (according to pixelmask)
	int startx;
	int endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
} DPSOFTRAST_State_Span);
180
// sizes of the spans[] and triangles[] arrays kept in each thread state
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of the thread "validate" field: a set bit means the matching derived
// values have to be recalculated by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB 0x1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 0x2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 0x4
// all of the above
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
188
// blend modes chosen by DPSOFTRAST_RecalcBlendFunc from the blend function
// factors; DPSOFTRAST_BLENDMODE_TOTAL is the number of modes
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA,
	DPSOFTRAST_BLENDMODE_ADDALPHA,
	DPSOFTRAST_BLENDMODE_ADD,
	DPSOFTRAST_BLENDMODE_INVMOD,
	DPSOFTRAST_BLENDMODE_MUL,
	DPSOFTRAST_BLENDMODE_MUL2,
	DPSOFTRAST_BLENDMODE_SUBALPHA,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
	DPSOFTRAST_BLENDMODE_INVADD,
	DPSOFTRAST_BLENDMODE_TOTAL
} DPSOFTRAST_BLENDMODE;
204
205 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
206 {
207         void *thread;
208         int index;
209         
210         int cullface;
211         int colormask[4];
212         int blendfunc[2];
213         int blendsubtract;
214         int depthmask;
215         int depthtest;
216         int depthfunc;
217         int scissortest;
218         int alphatest;
219         int alphafunc;
220         float alphavalue;
221         int viewport[4];
222         int scissor[4];
223         float depthrange[2];
224         float polygonoffset[2];
225
226         int shader_mode;
227         int shader_permutation;
228
229         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
230         
231         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
232         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
233
234         // DPSOFTRAST_VALIDATE_ flags
235         int validate;
236
237         // derived values (DPSOFTRAST_VALIDATE_FB)
238         int fb_colormask;
239         int fb_scissor[4];
240         ALIGN(float fb_viewportcenter[4]);
241         ALIGN(float fb_viewportscale[4]);
242
243         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
244         int fb_depthfunc;
245
246         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
247         int fb_blendmode;
248
249         // band boundaries
250         int miny1;
251         int maxy1;
252         int miny2;
253         int maxy2;
254
255         ATOMIC(volatile int commandoffset);
256
257         volatile bool waiting;
258         volatile bool starving;
259         void *waitcond;
260         void *drawcond;
261         void *drawmutex;
262
263         int numspans;
264         int numtriangles;
265         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
266         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
267 }
268 DPSOFTRAST_State_Thread);
269
270 typedef ATOMIC(struct DPSOFTRAST_State_s
271 {
272         int fb_width;
273         int fb_height;
274         unsigned int *fb_depthpixels;
275         unsigned int *fb_colorpixels[4];
276
277         int viewport[4];
278         ALIGN(float fb_viewportcenter[4]);
279         ALIGN(float fb_viewportscale[4]);
280
281         float color[4];
282         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
283         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
284
285         const float *pointer_vertex3f;
286         const float *pointer_color4f;
287         const unsigned char *pointer_color4ub;
288         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
289         int stride_vertex;
290         int stride_color;
291         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
292         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
293         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
294
295         int firstvertex;
296         int numvertices;
297         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
298         float *screencoord4f;
299         int drawstarty;
300         int drawendy;
301         int drawclipped;
302         
303         int shader_mode;
304         int shader_permutation;
305
306         int texture_max;
307         int texture_end;
308         int texture_firstfree;
309         DPSOFTRAST_Texture *texture;
310
311         int bigendian;
312
313         // error reporting
314         const char *errorstring;
315
316         bool usethreads;
317         int interlace;
318         int numthreads;
319         DPSOFTRAST_State_Thread *threads;
320
321         ATOMIC(volatile int drawcommand);
322
323         DPSOFTRAST_State_Command_Pool commandpool;
324 }
325 DPSOFTRAST_State);
326
327 DPSOFTRAST_State dpsoftrast;
328
// scale and offset used for the integer depth representation
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float components into one integer: a in bits 24-31, r in bits
// 16-23, g in bits 8-15, b in bits 0-7, each computed as (int)(c * 255 + 0.5);
// every argument is parenthesized so that compound expressions can be passed
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth d to integer depth DPSOFTRAST_DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
334
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
336 {
337         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339         fb_viewportcenter[3] = 0.5f;
340         fb_viewportcenter[0] = 0.0f;
341         fb_viewportscale[1] = 0.5f * viewport[2];
342         fb_viewportscale[2] = -0.5f * viewport[3];
343         fb_viewportscale[3] = 0.5f;
344         fb_viewportscale[0] = 1.0f;
345 }
346
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
348 {
349         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350         // and viewport projection values
351         int x1, x2;
352         int y1, y2;
353         x1 = thread->scissor[0];
354         x2 = thread->scissor[0] + thread->scissor[2];
355         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356         y2 = dpsoftrast.fb_height - thread->scissor[1];
357         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
358         if (x1 < 0) x1 = 0;
359         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
360         if (y1 < 0) y1 = 0;
361         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
362         thread->fb_scissor[0] = x1;
363         thread->fb_scissor[1] = y1;
364         thread->fb_scissor[2] = x2 - x1;
365         thread->fb_scissor[3] = y2 - y1;
366
367         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
368 }
369
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
371 {
372         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
373 }
374
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
376 {
377         if (thread->blendsubtract)
378         {
379                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
380                 {
381                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
383                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
385                 }
386         }
387         else
388         {       
389                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390                 {
391                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
396                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
402                 }
403         }
404 }
405
// calls DPSOFTRAST_Validate(thread, f) only if one of the flags in f is still
// set in thread->validate; always evaluates to 0.
// thread may be any pointer expression; it is evaluated more than once
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
407
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
409 {
410         mask &= thread->validate;
411         if (!mask)
412                 return;
413         if (mask & DPSOFTRAST_VALIDATE_FB)
414         {
415                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416                 DPSOFTRAST_RecalcFB(thread);
417         }
418         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
419         {
420                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421                 DPSOFTRAST_RecalcDepthFunc(thread);
422         }
423         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
424         {
425                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426                 DPSOFTRAST_RecalcBlendFunc(thread);
427         }
428 }
429
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
431 {
432         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433                 return &dpsoftrast.texture[index];
434         return NULL;
435 }
436
437 static void DPSOFTRAST_Texture_Grow(void)
438 {
439         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440         DPSOFTRAST_State_Thread *thread;
441         int i;
442         int j;
443         DPSOFTRAST_Flush();
444         // expand texture array as needed
445         if (dpsoftrast.texture_max < 1024)
446                 dpsoftrast.texture_max = 1024;
447         else
448                 dpsoftrast.texture_max *= 2;
449         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
450         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451                 if (dpsoftrast.texbound[i])
452                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
453         for (j = 0; j < dpsoftrast.numthreads; j++)
454         {
455                 thread = &dpsoftrast.threads[j];
456                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457                         if (thread->texbound[i])
458                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
459         }
460 }
461
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
463 {
464         int w;
465         int h;
466         int d;
467         int size;
468         int s;
469         int texnum;
470         int mipmaps;
471         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473         DPSOFTRAST_Texture *texture;
474         if (width*height*depth < 1)
475         {
476                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
477                 return 0;
478         }
479         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
480         {
481                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
482                 return 0;
483         }
484         switch(texformat)
485         {
486         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
489                 break;
490         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
492                 {
493                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
494                         return 0;
495                 }
496                 if (depth != 1)
497                 {
498                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
499                         return 0;
500                 }
501                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
502                 {
503                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
504                         return 0;
505                 }
506                 break;
507         }
508         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
509         {
510                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
511                 return 0;
512         }
513         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
514         {
515                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
516                 return 0;
517         }
518         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
519         {
520                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
521                 return 0;
522         }
523         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524         {
525                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
526                 return 0;
527         }
528         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
529         {
530                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
531                 return 0;
532         }
533         // find first empty slot in texture array
534         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535                 if (!dpsoftrast.texture[texnum].bytes)
536                         break;
537         dpsoftrast.texture_firstfree = texnum + 1;
538         if (dpsoftrast.texture_max <= texnum)
539                 DPSOFTRAST_Texture_Grow();
540         if (dpsoftrast.texture_end <= texnum)
541                 dpsoftrast.texture_end = texnum + 1;
542         texture = &dpsoftrast.texture[texnum];
543         memset(texture, 0, sizeof(*texture));
544         texture->flags = flags;
545         texture->width = width;
546         texture->height = height;
547         texture->depth = depth;
548         texture->sides = sides;
549         texture->binds = 0;
550         w = width;
551         h = height;
552         d = depth;
553         size = 0;
554         mipmaps = 0;
555         w = width;
556         h = height;
557         d = depth;
558         for (;;)
559         {
560                 s = w * h * d * sides * 4;
561                 texture->mipmap[mipmaps][0] = size;
562                 texture->mipmap[mipmaps][1] = s;
563                 texture->mipmap[mipmaps][2] = w;
564                 texture->mipmap[mipmaps][3] = h;
565                 texture->mipmap[mipmaps][4] = d;
566                 size += s;
567                 mipmaps++;
568                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569                         break;
570                 if (w > 1) w >>= 1;
571                 if (h > 1) h >>= 1;
572                 if (d > 1) d >>= 1;
573         }
574         texture->mipmaps = mipmaps;
575         texture->size = size;
576
577         // allocate the pixels now
578         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
579
580         return texnum;
581 }
582 void DPSOFTRAST_Texture_Free(int index)
583 {
584         DPSOFTRAST_Texture *texture;
585         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
586         if (texture->binds)
587                 DPSOFTRAST_Flush();
588         if (texture->bytes)
589                 MM_FREE(texture->bytes);
590         texture->bytes = NULL;
591         memset(texture, 0, sizeof(*texture));
592         // adjust the free range and used range
593         if (dpsoftrast.texture_firstfree > index)
594                 dpsoftrast.texture_firstfree = index;
595         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596                 dpsoftrast.texture_end--;
597 }
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
599 {
600         int i, x, y, z, w, layer0, layer1, row0, row1;
601         unsigned char *o, *i0, *i1, *i2, *i3;
602         DPSOFTRAST_Texture *texture;
603         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604         if (texture->mipmaps <= 1)
605                 return;
606         for (i = 1;i < texture->mipmaps;i++)
607         {
608                 for (z = 0;z < texture->mipmap[i][4];z++)
609                 {
610                         layer0 = z*2;
611                         layer1 = z*2+1;
612                         if (layer1 >= texture->mipmap[i-1][4])
613                                 layer1 = texture->mipmap[i-1][4]-1;
614                         for (y = 0;y < texture->mipmap[i][3];y++)
615                         {
616                                 row0 = y*2;
617                                 row1 = y*2+1;
618                                 if (row1 >= texture->mipmap[i-1][3])
619                                         row1 = texture->mipmap[i-1][3]-1;
620                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
621                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625                                 w = texture->mipmap[i][2];
626                                 if (layer1 > layer0)
627                                 {
628                                         if (texture->mipmap[i-1][2] > 1)
629                                         {
630                                                 // average 3D texture
631                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
632                                                 {
633                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
637                                                 }
638                                         }
639                                         else
640                                         {
641                                                 // average 3D mipmap with parent width == 1
642                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
643                                                 {
644                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
648                                                 }
649                                         }
650                                 }
651                                 else
652                                 {
653                                         if (texture->mipmap[i-1][2] > 1)
654                                         {
655                                                 // average 2D texture (common case)
656                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
657                                                 {
658                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
662                                                 }
663                                         }
664                                         else
665                                         {
666                                                 // 2D texture with parent width == 1
667                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
668                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
669                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
670                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
671                                         }
672                                 }
673                         }
674                 }
675         }
676 }
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
678 {
679         DPSOFTRAST_Texture *texture;
680         unsigned char *dst;
681         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
682         if (texture->binds)
683                 DPSOFTRAST_Flush();
684         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685         while (blockheight > 0)
686         {
687                 memcpy(dst, pixels, blockwidth * 4);
688                 pixels += blockwidth * 4;
689                 dst += texture->mipmap[0][2] * 4;
690                 blockheight--;
691         }
692         DPSOFTRAST_Texture_CalculateMipmaps(index);
693 }
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
695 {
696         DPSOFTRAST_Texture *texture;
697         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
698         if (texture->binds)
699                 DPSOFTRAST_Flush();
700         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707         return texture->mipmap[mip][2];
708 }
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
710 {
711         DPSOFTRAST_Texture *texture;
712         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713         return texture->mipmap[mip][3];
714 }
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
716 {
717         DPSOFTRAST_Texture *texture;
718         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719         return texture->mipmap[mip][4];
720 }
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
722 {
723         DPSOFTRAST_Texture *texture;
724         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725         if (texture->binds)
726                 DPSOFTRAST_Flush();
727         return texture->bytes + texture->mipmap[mip][0];
728 }
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
734         {
735                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
736                 return;
737         }
738         if (texture->binds)
739                 DPSOFTRAST_Flush();
740         texture->filter = filter;
741 }
742
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
744 {
745         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
748                 DPSOFTRAST_Flush();
749         dpsoftrast.fb_width = width;
750         dpsoftrast.fb_height = height;
751         dpsoftrast.fb_depthpixels = depthpixels;
752         dpsoftrast.fb_colorpixels[0] = colorpixels0;
753         dpsoftrast.fb_colorpixels[1] = colorpixels1;
754         dpsoftrast.fb_colorpixels[2] = colorpixels2;
755         dpsoftrast.fb_colorpixels[3] = colorpixels3;
756 }
757
758 static void DPSOFTRAST_Draw_FlushThreads(void);
759
760 static void DPSOFTRAST_Draw_SyncCommands(void)
761 {
762         if(dpsoftrast.usethreads) MEMORY_BARRIER;
763         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
764 }
765
766 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
767 {
768         DPSOFTRAST_State_Thread *thread;
769         int i;
770         int freecommand = dpsoftrast.commandpool.freecommand;
771         int usedcommands = dpsoftrast.commandpool.usedcommands;
772         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
773                 return;
774         DPSOFTRAST_Draw_SyncCommands();
775         for(;;)
776         {
777                 int waitindex = -1;
778                 int commandoffset;
779                 usedcommands = 0;
780                 for (i = 0; i < dpsoftrast.numthreads; i++)
781                 {
782                         thread = &dpsoftrast.threads[i]; 
783                         commandoffset = freecommand - thread->commandoffset;
784                         if (commandoffset < 0)
785                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
786                         if (commandoffset > usedcommands)
787                         {
788                                 waitindex = i;
789                                 usedcommands = commandoffset;
790                         }
791                 }
792                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
793                         break;
794                 thread = &dpsoftrast.threads[waitindex];
795                 Thread_LockMutex(thread->drawmutex);
796                 if (thread->commandoffset != dpsoftrast.drawcommand)
797                 {
798                         thread->waiting = true;
799                         if (thread->starving) Thread_CondSignal(thread->drawcond);
800                         Thread_CondWait(thread->waitcond, thread->drawmutex);
801                         thread->waiting = false;
802                 }
803                 Thread_UnlockMutex(thread->drawmutex);
804         }
805         dpsoftrast.commandpool.usedcommands = usedcommands;
806 }
807
// Rounds size up to the next multiple of COMMAND_SIZE; relies on COMMAND_SIZE
// being a power of two (the mask arithmetic is meaningless otherwise).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> and yields a correctly typed pointer to it.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_##name))))
812
813 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
814 {
815         DPSOFTRAST_Command *command;
816         int freecommand = dpsoftrast.commandpool.freecommand;
817         int usedcommands = dpsoftrast.commandpool.usedcommands;
818         int extra = sizeof(DPSOFTRAST_Command);
819         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
820                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
821         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
822         {
823                 if (dpsoftrast.usethreads)
824                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
825                 else
826                         DPSOFTRAST_Draw_FlushThreads();
827                 freecommand = dpsoftrast.commandpool.freecommand;
828                 usedcommands = dpsoftrast.commandpool.usedcommands;
829         }
830         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831         {
832                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833                 command->opcode = DPSOFTRAST_OPCODE_Reset;
834                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
835                 freecommand = 0;
836         }
837         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838         command->opcode = opcode;
839         command->commandsize = size;
840         freecommand += size;
841         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
842                 freecommand = 0;
843         dpsoftrast.commandpool.freecommand = freecommand;
844         dpsoftrast.commandpool.usedcommands = usedcommands + size;
845         return command;
846 }
847
848 static void DPSOFTRAST_UndoCommand(int size)
849 {
850         int freecommand = dpsoftrast.commandpool.freecommand;
851         int usedcommands = dpsoftrast.commandpool.usedcommands;
852         freecommand -= size;
853         if (freecommand < 0)
854                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855         usedcommands -= size;
856         dpsoftrast.commandpool.freecommand = freecommand;
857         dpsoftrast.commandpool.usedcommands = usedcommands;
858 }
859                 
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
862 {
863         thread->viewport[0] = command->x;
864         thread->viewport[1] = command->y;
865         thread->viewport[2] = command->width;
866         thread->viewport[3] = command->height;
867         thread->validate |= DPSOFTRAST_VALIDATE_FB;
868 }
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
870 {
871         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
872         command->x = x;
873         command->y = y;
874         command->width = width;
875         command->height = height;
876
877         dpsoftrast.viewport[0] = x;
878         dpsoftrast.viewport[1] = y;
879         dpsoftrast.viewport[2] = width;
880         dpsoftrast.viewport[3] = height;
881         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
882 }
883
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
886 {
887         int i, x1, y1, x2, y2, w, h, x, y;
888         int miny1 = thread->miny1;
889         int maxy1 = thread->maxy1;
890         int miny2 = thread->miny2;
891         int maxy2 = thread->maxy2;
892         int bandy;
893         unsigned int *p;
894         unsigned int c;
895         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896         x1 = thread->fb_scissor[0];
897         y1 = thread->fb_scissor[1];
898         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
900         if (y1 < miny1) y1 = miny1;
901         if (y2 > maxy2) y2 = maxy2;
902         w = x2 - x1;
903         h = y2 - y1;
904         if (w < 1 || h < 1)
905                 return;
906         // FIXME: honor fb_colormask?
907         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
908         for (i = 0;i < 4;i++)
909         {
910                 if (!dpsoftrast.fb_colorpixels[i])
911                         continue;
912                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
913                 for (;y < bandy;y++)
914                 {
915                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916                         for (x = x1;x < x2;x++)
917                                 p[x] = c;
918                 }
919         }
920 }
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
922 {
923         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
924         command->r = r;
925         command->g = g;
926         command->b = b;
927         command->a = a;
928 }
929
930 DEFCOMMAND(3, ClearDepth, float depth;)
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
932 {
933         int x1, y1, x2, y2, w, h, x, y;
934         int miny1 = thread->miny1;
935         int maxy1 = thread->maxy1;
936         int miny2 = thread->miny2;
937         int maxy2 = thread->maxy2;
938         int bandy;
939         unsigned int *p;
940         unsigned int c;
941         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942         x1 = thread->fb_scissor[0];
943         y1 = thread->fb_scissor[1];
944         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
946         if (y1 < miny1) y1 = miny1;
947         if (y2 > maxy2) y2 = maxy2;
948         w = x2 - x1;
949         h = y2 - y1;
950         if (w < 1 || h < 1)
951                 return;
952         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
953         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
954         for (;y < bandy;y++)
955         {
956                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957                 for (x = x1;x < x2;x++)
958                         p[x] = c;
959         }
960 }
961 void DPSOFTRAST_ClearDepth(float d)
962 {
963         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
964         command->depth = d;
965 }
966
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
969 {
970         thread->colormask[0] = command->r != 0;
971         thread->colormask[1] = command->g != 0;
972         thread->colormask[2] = command->b != 0;
973         thread->colormask[3] = command->a != 0;
974         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
975 }
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
977 {
978         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
985 DEFCOMMAND(5, DepthTest, int enable;)
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
987 {
988         thread->depthtest = command->enable;
989         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
990 }
991 void DPSOFTRAST_DepthTest(int enable)
992 {
993         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994         command->enable = enable;
995 }
996
997 DEFCOMMAND(6, ScissorTest, int enable;)
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
999 {
1000         thread->scissortest = command->enable;
1001         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1002 }
1003 void DPSOFTRAST_ScissorTest(int enable)
1004 {
1005         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006         command->enable = enable;
1007 }
1008
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1011 {
1012         thread->scissor[0] = command->x;
1013         thread->scissor[1] = command->y;
1014         thread->scissor[2] = command->width;
1015         thread->scissor[3] = command->height;
1016         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1017 }
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1019 {
1020         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1021         command->x = x;
1022         command->y = y;
1023         command->width = width;
1024         command->height = height;
1025 }
1026
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1029 {
1030         thread->blendfunc[0] = command->sfactor;
1031         thread->blendfunc[1] = command->dfactor;
1032         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1033 }
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1035 {
1036         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037         command->sfactor = sfactor;
1038         command->dfactor = dfactor;
1039 }
1040
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1043 {
1044         thread->blendsubtract = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1046 }
1047 void DPSOFTRAST_BlendSubtract(int enable)
1048 {
1049         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(10, DepthMask, int enable;)
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1055 {
1056         thread->depthmask = command->enable;
1057 }
1058 void DPSOFTRAST_DepthMask(int enable)
1059 {
1060         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061         command->enable = enable;
1062 }
1063
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1066 {
1067         thread->depthfunc = command->func;
1068 }
1069 void DPSOFTRAST_DepthFunc(int func)
1070 {
1071         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072         command->func = func;
1073 }
1074
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1077 {
1078         thread->depthrange[0] = command->nearval;
1079         thread->depthrange[1] = command->farval;
1080 }
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1082 {
1083         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084         command->nearval = nearval;
1085         command->farval = farval;
1086 }
1087
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1090 {
1091         thread->polygonoffset[0] = command->alongnormal;
1092         thread->polygonoffset[1] = command->intoview;
1093 }
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1095 {
1096         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097         command->alongnormal = alongnormal;
1098         command->intoview = intoview;
1099 }
1100
1101 DEFCOMMAND(14, CullFace, int mode;)
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1103 {
1104         thread->cullface = command->mode;
1105 }
1106 void DPSOFTRAST_CullFace(int mode)
1107 {
1108         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109         command->mode = mode;
1110 }
1111
1112 DEFCOMMAND(15, AlphaTest, int enable;)
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1114 {
1115         thread->alphatest = command->enable;
1116 }
1117 void DPSOFTRAST_AlphaTest(int enable)
1118 {
1119         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120         command->enable = enable;
1121 }
1122
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1125 {
1126         thread->alphafunc = command->func;
1127         thread->alphavalue = command->ref;
1128 }
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1130 {
1131         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132         command->func = func;
1133         command->ref = ref;
1134 }
1135
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1137 {
1138         dpsoftrast.color[0] = r;
1139         dpsoftrast.color[1] = g;
1140         dpsoftrast.color[2] = b;
1141         dpsoftrast.color[3] = a;
1142 }
1143
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1145 {
1146         int outstride = blockwidth * 4;
1147         int instride = dpsoftrast.fb_width * 4;
1148         int bx1 = blockx;
1149         int by1 = blocky;
1150         int bx2 = blockx + blockwidth;
1151         int by2 = blocky + blockheight;
1152         int bw;
1153         int bh;
1154         int x;
1155         int y;
1156         unsigned char *inpixels;
1157         unsigned char *b;
1158         unsigned char *o;
1159         DPSOFTRAST_Flush();
1160         if (bx1 < 0) bx1 = 0;
1161         if (by1 < 0) by1 = 0;
1162         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1163         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1164         bw = bx2 - bx1;
1165         bh = by2 - by1;
1166         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1167         if (dpsoftrast.bigendian)
1168         {
1169                 for (y = by1;y < by2;y++)
1170                 {
1171                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1172                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1173                         for (x = bx1;x < bx2;x++)
1174                         {
1175                                 o[0] = b[3];
1176                                 o[1] = b[2];
1177                                 o[2] = b[1];
1178                                 o[3] = b[0];
1179                                 o += 4;
1180                                 b += 4;
1181                         }
1182                 }
1183         }
1184         else
1185         {
1186                 for (y = by1;y < by2;y++)
1187                 {
1188                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1190                         memcpy(o, b, bw*4);
1191                 }
1192         }
1193
1194 }
1195 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1196 {
1197         int tx1 = tx;
1198         int ty1 = ty;
1199         int tx2 = tx + width;
1200         int ty2 = ty + height;
1201         int sx1 = sx;
1202         int sy1 = sy;
1203         int sx2 = sx + width;
1204         int sy2 = sy + height;
1205         int swidth;
1206         int sheight;
1207         int twidth;
1208         int theight;
1209         int sw;
1210         int sh;
1211         int tw;
1212         int th;
1213         int y;
1214         unsigned int *spixels;
1215         unsigned int *tpixels;
1216         DPSOFTRAST_Texture *texture;
1217         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1218         if (mip < 0 || mip >= texture->mipmaps) return;
1219         DPSOFTRAST_Flush();
1220         spixels = dpsoftrast.fb_colorpixels[0];
1221         swidth = dpsoftrast.fb_width;
1222         sheight = dpsoftrast.fb_height;
1223         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1224         twidth = texture->mipmap[mip][2];
1225         theight = texture->mipmap[mip][3];
1226         if (tx1 < 0) tx1 = 0;
1227         if (ty1 < 0) ty1 = 0;
1228         if (tx2 > twidth) tx2 = twidth;
1229         if (ty2 > theight) ty2 = theight;
1230         if (sx1 < 0) sx1 = 0;
1231         if (sy1 < 0) sy1 = 0;
1232         if (sx2 > swidth) sx2 = swidth;
1233         if (sy2 > sheight) sy2 = sheight;
1234         tw = tx2 - tx1;
1235         th = ty2 - ty1;
1236         sw = sx2 - sx1;
1237         sh = sy2 - sy1;
1238         if (tw > sw) tw = sw;
1239         if (th > sh) th = sh;
1240         if (tw < 1 || th < 1)
1241                 return;
1242         for (y = 0;y < th;y++)
1243                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1244         if (texture->mipmaps > 1)
1245                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1246 }
1247
1248 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1249 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1250 {
1251         if (thread->texbound[command->unitnum])
1252                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1253         thread->texbound[command->unitnum] = command->texture;
1254 }
1255 void DPSOFTRAST_SetTexture(int unitnum, int index)
1256 {
1257         DPSOFTRAST_Command_SetTexture *command;
1258         DPSOFTRAST_Texture *texture;
1259         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1260         {
1261                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1262                 return;
1263         }
1264         texture = DPSOFTRAST_Texture_GetByIndex(index);
1265         if (index && !texture)
1266         {
1267                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1268                 return;
1269         }
1270
1271         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1272         command->unitnum = unitnum;
1273         command->texture = texture;
1274
1275         dpsoftrast.texbound[unitnum] = texture;
1276         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1277 }
1278
1279 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1280 {
1281         dpsoftrast.pointer_vertex3f = vertex3f;
1282         dpsoftrast.stride_vertex = stride;
1283 }
1284 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1285 {
1286         dpsoftrast.pointer_color4f = color4f;
1287         dpsoftrast.pointer_color4ub = NULL;
1288         dpsoftrast.stride_color = stride;
1289 }
1290 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1291 {
1292         dpsoftrast.pointer_color4f = NULL;
1293         dpsoftrast.pointer_color4ub = color4ub;
1294         dpsoftrast.stride_color = stride;
1295 }
1296 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1297 {
1298         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1299         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1300         dpsoftrast.stride_texcoord[unitnum] = stride;
1301 }
1302
1303 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1304 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1305 {
1306         thread->shader_mode = command->mode;
1307         thread->shader_permutation = command->permutation;
1308 }
1309 void DPSOFTRAST_SetShader(int mode, int permutation)
1310 {
1311         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1312         command->mode = mode;
1313         command->permutation = permutation;
1314
1315         dpsoftrast.shader_mode = mode;
1316         dpsoftrast.shader_permutation = permutation;
1317 }
1318
1319 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1320 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1321 {
1322         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1323 }
1324 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1325 {
1326         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1327         command->index = index;
1328         command->val[0] = v0;
1329         command->val[1] = v1;
1330         command->val[2] = v2;
1331         command->val[3] = v3;
1332
1333         dpsoftrast.uniform4f[index*4+0] = v0;
1334         dpsoftrast.uniform4f[index*4+1] = v1;
1335         dpsoftrast.uniform4f[index*4+2] = v2;
1336         dpsoftrast.uniform4f[index*4+3] = v3;
1337 }
1338 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1339 {
1340         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1341         command->index = index;
1342         memcpy(command->val, v, sizeof(command->val));
1343
1344         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1345 }
1346
1347 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1348 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1349 {
1350         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1351 }
1352 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1353 {
1354 #ifdef SSE2_PRESENT
1355         int i, index;
1356         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1357         {
1358                 __m128 m0, m1, m2, m3;
1359                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1360                 command->index = (DPSOFTRAST_UNIFORM)index;
1361                 if (((size_t)v)&(ALIGN_SIZE-1))
1362                 {
1363                         m0 = _mm_loadu_ps(v);
1364                         m1 = _mm_loadu_ps(v+4);
1365                         m2 = _mm_loadu_ps(v+8);
1366                         m3 = _mm_loadu_ps(v+12);
1367                 }
1368                 else
1369                 {
1370                         m0 = _mm_load_ps(v);
1371                         m1 = _mm_load_ps(v+4);
1372                         m2 = _mm_load_ps(v+8);
1373                         m3 = _mm_load_ps(v+12);
1374                 }
1375                 if (transpose)
1376                 {
1377                         __m128 t0, t1, t2, t3;
1378                         t0 = _mm_unpacklo_ps(m0, m1);
1379                         t1 = _mm_unpacklo_ps(m2, m3);
1380                         t2 = _mm_unpackhi_ps(m0, m1);
1381                         t3 = _mm_unpackhi_ps(m2, m3);
1382                         m0 = _mm_movelh_ps(t0, t1);
1383                         m1 = _mm_movehl_ps(t1, t0);
1384                         m2 = _mm_movelh_ps(t2, t3);
1385                         m3 = _mm_movehl_ps(t3, t2);                     
1386                 }
1387                 _mm_store_ps(command->val, m0);
1388                 _mm_store_ps(command->val+4, m1);
1389                 _mm_store_ps(command->val+8, m2);
1390                 _mm_store_ps(command->val+12, m3);
1391                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1392                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1393                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1394                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1395         }
1396 #endif
1397 }
1398
1399 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1400 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1401 {
1402         thread->uniform1i[command->index] = command->val;
1403 }
1404 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1405 {
1406         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1407         command->index = index;
1408         command->val = i0;
1409
1410         dpsoftrast.uniform1i[command->index] = i0;
1411 }
1412
1413 #ifdef SSE2_PRESENT
1414 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1415 {
1416         float *end = dst + size*4;
1417         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1418         {
1419                 while (dst < end)
1420                 {
1421                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1422                         dst += 4;
1423                         src += stride;
1424                 }
1425         }
1426         else
1427         {
1428                 while (dst < end)
1429                 {
1430                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1431                         dst += 4;
1432                         src += stride;
1433                 }
1434         }
1435 }
1436
1437 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1438 {
1439         float *end = dst + size*4;
1440         if (stride == sizeof(float[3]))
1441         {
1442                 float *end4 = dst + (size&~3)*4;        
1443                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1444                 {
1445                         while (dst < end4)
1446                         {
1447                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1448                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1449                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1450                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1451                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1452                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1453                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1454                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1455                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1456                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1457                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1458                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1459                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1460                                 dst += 16;
1461                                 src += 4*sizeof(float[3]);
1462                         }
1463                 }
1464                 else
1465                 {
1466                         while (dst < end4)
1467                         {
1468                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1469                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1470                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1472                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1473                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1476                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1480                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1481                                 dst += 16;
1482                                 src += 4*sizeof(float[3]);
1483                         }
1484                 }
1485         }
1486         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1487         {
1488                 while (dst < end)
1489                 {
1490                         __m128 v = _mm_loadu_ps((const float *)src);
1491                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1492                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1493                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1494                         _mm_store_ps(dst, v);
1495                         dst += 4;
1496                         src += stride;
1497                 }
1498         }
1499         else
1500         {
1501                 while (dst < end)
1502                 {
1503                         __m128 v = _mm_load_ps((const float *)src);
1504                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1505                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1506                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1507                         _mm_store_ps(dst, v);
1508                         dst += 4;
1509                         src += stride;
1510                 }
1511         }
1512 }
1513
1514 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1515 {
1516         float *end = dst + size*4;
1517         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1518         if (stride == sizeof(float[2]))
1519         {
1520                 float *end2 = dst + (size&~1)*4;
1521                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1522                 {
1523                         while (dst < end2)
1524                         {
1525                                 __m128 v = _mm_loadu_ps((const float *)src);
1526                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1527                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1528                                 dst += 8;
1529                                 src += 2*sizeof(float[2]);
1530                         }
1531                 }
1532                 else
1533                 {
1534                         while (dst < end2)
1535                         {
1536                                 __m128 v = _mm_load_ps((const float *)src);
1537                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1538                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1539                                 dst += 8;
1540                                 src += 2*sizeof(float[2]);
1541                         }
1542                 }
1543         }
1544         while (dst < end)
1545         {
1546                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1547                 dst += 4;
1548                 src += stride;
1549         }
1550 }
1551
1552 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1553 {
1554         float *end = dst + size*4;
1555         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1556         if (stride == sizeof(unsigned char[4]))
1557         {
1558                 float *end4 = dst + (size&~3)*4;
1559                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1560                 {
1561                         while (dst < end4)
1562                         {
1563                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1564                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1565                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1566                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1567                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1568                                 dst += 16;
1569                                 src += 4*sizeof(unsigned char[4]);
1570                         }
1571                 }
1572                 else
1573                 {
1574                         while (dst < end4)
1575                         {
1576                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1577                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1578                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1579                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1580                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1581                                 dst += 16;
1582                                 src += 4*sizeof(unsigned char[4]);
1583                         }
1584                 }
1585         }
1586         while (dst < end)
1587         {
1588                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1589                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1590                 dst += 4;
1591                 src += stride;
1592         }
1593 }
1594
// Writes `size` copies of the 4-float value at `src` to consecutive 4-float
// slots starting at `dst`. src is read once with an unaligned load; dst is
// written with _mm_store_ps.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++, dst += 4)
		_mm_store_ps(dst, value);
}
1605 #endif
1606
// Transforms `numitems` 4-float vectors from in4f into out4f:
//   out = in[0]*M0 + in[1]*M1 + in[2]*M2 + in[3]*M3
// where Mk is the 4-float group inmatrix16f[4*k .. 4*k+3].
// If the matrix is bit-for-bit the identity, the data is just copied (and nothing
// is done at all when out4f == in4f).
// out4f is written with _mm_store_ps; in4f and inmatrix16f may be unaligned.
// Without SSE2_PRESENT this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 col0, col1, col2, col3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	col0 = _mm_loadu_ps(inmatrix16f);
	col1 = _mm_loadu_ps(inmatrix16f + 4);
	col2 = _mm_loadu_ps(inmatrix16f + 8);
	col3 = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
	{
		// input is aligned
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0), sum);
			_mm_store_ps(out4f, sum);
		}
	}
	else
	{
		// input is not aligned
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0), sum);
			_mm_store_ps(out4f, sum);
		}
	}
#endif
}
1654
// Copies `numitems` 4-float vectors from in4f to out4f.
// Nothing is done for a non-positive count or when both pointers are the same;
// partially overlapping buffers are handled (memmove).
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1659
1660 #ifdef SSE2_PRESENT
// Perspective-divides the 4-float vector `in` and applies the viewport transform:
//   out[0..2] = viewportcenter[1..3] + viewportscale[1..3] * in[0..2] / in[3]
//   out[3]    = viewportcenter[0]    + viewportscale[0]    * 1.0f     / in[3]
// `out` may be the same variable as `in`.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1668
// Performs the same computation as DPSOFTRAST_PROJECTVERTEX (all four lanes are
// projected, not only y):
//   out[0..2] = viewportcenter[1..3] + viewportscale[1..3] * in[0..2] / in[3]
//   out[3]    = viewportcenter[0]    + viewportscale[0]    * 1.0f     / in[3]
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1676
// out = in[0]*m0 + in[1]*m1 + in[2]*m2 + in[3]*m3, summed from the m3 term inwards
// (in[0]*m0 + (in[1]*m1 + (in[2]*m2 + in[3]*m3))). `out` may be the same variable as `in`.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	__m128 tv_sum = _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	tv_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), tv_sum); \
	tv_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), tv_sum); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), tv_sum); \
}
1685
// Bounds the projected y extent of the axis-aligned box [minpos, maxpos] after
// transforming its eight corners by m0..m3.
// Corners that pass the `z + w >= 0` test contribute y/w directly; for each corner
// that fails it, the points where its three box edges cross z + w == 0 contribute
// instead (only edges whose other end passed the test).
// The resulting y range is clamped to [-2, 2], run through lane 2 of
// viewportcenter/viewportscale, and truncated into *starty / *endy.
// Returns a mask with bit k set for every corner k that failed the test
// (bit 0 of k selects maxpos x, bit 1 maxpos y, bit 2 maxpos z and w).
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	__m128 bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f);
	__m128 maxproj = _mm_set_ss(-2.0f);
	int clipmask = 0xFF;
	// exchange lanes 0 and 1 of m0..m3 so the transformed y component ends up in lane 0,
	// which is the lane the scalar (_ss) intrinsics below operate on
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
	// BBFRONT(k, pos): transform corner k, record its z + w distance, and if that is
	// not negative clear its clipmask bit and merge y/w into the running bounds
	#define BBFRONT(k, pos) \
	{ \
		__m128 bbw; \
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
		bbw = _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)); \
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), bbw); \
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
		{ \
			__m128 proj = _mm_div_ss(bb[k], bbw); \
			clipmask &= ~(1<<k); \
			minproj = _mm_min_ss(minproj, proj); \
			maxproj = _mm_max_ss(maxproj, proj); \
		} \
	}
	BBFRONT(0, minpos);
	BBFRONT(1, _mm_move_ss(minpos, maxpos));
	BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(6, _mm_move_ss(maxpos, minpos));
	BBFRONT(7, maxpos);
	// BBCLIP(k): for a corner that failed the test, walk its three edges (neighbours
	// k^1, k^2, k^4 in that order); where the neighbour passed, interpolate to the
	// point with zero distance and merge its y/w into the running bounds
	#define BBCLIP(k) \
	{ \
		if (clipmask&(1<<k)) \
		{ \
			int axis; \
			for (axis = 1; axis <= 4; axis <<= 1) \
			{ \
				const int other = k^axis; \
				if (!(clipmask&(1<<other))) \
				{ \
					__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[other])); \
					__m128 edge = _mm_sub_ps(bb[other], bb[k]); \
					__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), edge)); \
					proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
					minproj = _mm_min_ss(minproj, proj); \
					maxproj = _mm_max_ss(maxproj, proj); \
				} \
			} \
		} \
	}
	BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
	// bring lane 2 of the viewport vectors (the lane DPSOFTRAST_PROJECTVERTEX pairs with y) into lane 0
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	// NOTE(review): starty comes from the largest projected y and endy from the smallest,
	// presumably because the y viewport scale is negative - confirm where fb_viewportscale is set
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1756         
1757 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1758 {
1759         float *end = out4f + numitems*4;
1760         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1761         __m128 minpos, maxpos;
1762         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1763         {
1764                 minpos = maxpos = _mm_loadu_ps(in4f);
1765                 while (out4f < end)
1766                 {
1767                         __m128 v = _mm_loadu_ps(in4f);
1768                         minpos = _mm_min_ps(minpos, v);
1769                         maxpos = _mm_max_ps(maxpos, v);
1770                         _mm_store_ps(out4f, v);
1771                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1772                         _mm_store_ps(screen4f, v);
1773                         in4f += 4;
1774                         out4f += 4;
1775                         screen4f += 4;
1776                 }
1777         }
1778         else
1779         {
1780                 minpos = maxpos = _mm_load_ps(in4f);
1781                 while (out4f < end)
1782                 {
1783                         __m128 v = _mm_load_ps(in4f);
1784                         minpos = _mm_min_ps(minpos, v);
1785                         maxpos = _mm_max_ps(maxpos, v);
1786                         _mm_store_ps(out4f, v);
1787                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1788                         _mm_store_ps(screen4f, v);
1789                         in4f += 4;
1790                         out4f += 4;
1791                         screen4f += 4;
1792                 }
1793         }
1794         if (starty && endy) 
1795                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1796                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1797                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1798                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1799                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1800         return 0;
1801 }
1802
1803 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1804 {
1805         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1806         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1807         float *end;
1808         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1809                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1810         end = out4f + numitems*4;
1811         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1812         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1813         m0 = _mm_loadu_ps(inmatrix16f);
1814         m1 = _mm_loadu_ps(inmatrix16f + 4);
1815         m2 = _mm_loadu_ps(inmatrix16f + 8);
1816         m3 = _mm_loadu_ps(inmatrix16f + 12);
1817         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1818         {
1819                 minpos = maxpos = _mm_loadu_ps(in4f);
1820                 while (out4f < end)
1821                 {
1822                         __m128 v = _mm_loadu_ps(in4f);
1823                         minpos = _mm_min_ps(minpos, v);
1824                         maxpos = _mm_max_ps(maxpos, v);
1825                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1826                         _mm_store_ps(out4f, v);
1827                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1828                         _mm_store_ps(screen4f, v);
1829                         in4f += 4;
1830                         out4f += 4;
1831                         screen4f += 4;
1832                 }
1833         }
1834         else
1835         {
1836                 minpos = maxpos = _mm_load_ps(in4f);
1837                 while (out4f < end)
1838                 {
1839                         __m128 v = _mm_load_ps(in4f);
1840                         minpos = _mm_min_ps(minpos, v);
1841                         maxpos = _mm_max_ps(maxpos, v);
1842                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1843                         _mm_store_ps(out4f, v);
1844                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1845                         _mm_store_ps(screen4f, v);
1846                         in4f += 4;
1847                         out4f += 4;
1848                         screen4f += 4;
1849                 }
1850         }
1851         if (starty && endy) 
1852                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1853         return 0;
1854 }
1855 #endif
1856
// Fills dpsoftrast.post_array4f[outarray] with dpsoftrast.numvertices 4-float elements
// taken from the client array selected by `inarray`, starting at dpsoftrast.firstvertex:
//   DPSOFTRAST_ARRAY_POSITION - pointer_vertex3f, expanded with DPSOFTRAST_Load3fTo4f
//   DPSOFTRAST_ARRAY_COLOR    - pointer_color4f if set, else pointer_color4ub if set,
//                               else every element is filled with dpsoftrast.color
//   anything else             - texcoord unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0) with
//                               2, 3 or 4 float components; nothing is written if that
//                               unit has no pointer or another component count
// Returns the output array, or NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
	}
	else
	{
		const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1915
1916 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1917 {
1918         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1919         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1920         return data;
1921 }
1922
1923 #if 0
// Projects an array to dpsoftrast.screencoord4f with DPSOFTRAST_Vertex_Project, stores
// the returned clip mask in dpsoftrast.drawclipped and the row range in
// dpsoftrast.drawstarty / drawendy, and returns the array.
// With inarray >= 0 the array is first loaded into slot `outarray`; otherwise the
// existing contents of dpsoftrast.post_array4f[outarray] are used.
// Returns NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
1934 #endif
1935
// Transforms an array in place by inmatrix16f and projects it to
// dpsoftrast.screencoord4f with DPSOFTRAST_Vertex_TransformProject, storing the returned
// clip mask in dpsoftrast.drawclipped and the row range in dpsoftrast.drawstarty / drawendy.
// With inarray >= 0 the array is first loaded into slot `outarray`; otherwise the
// existing contents of dpsoftrast.post_array4f[outarray] are used.
// Returns the array, or NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1946
1947 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1948 {
1949         int x;
1950         int startx = span->startx;
1951         int endx = span->endx;
1952         float wslope = triangle->w[0];
1953         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1954         float endz = 1.0f / (w + wslope * startx);
1955         for (x = startx;x < endx;)
1956         {
1957                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1958                 float z = endz, dz;
1959                 if (nextsub >= endx) nextsub = endsub = endx-1;
1960                 endz = 1.0f / (w + wslope * nextsub);
1961                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1962                 for (; x <= endsub; x++, z += dz)
1963                         zf[x] = z;
1964         }
1965 }
1966
1967 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1968 {
1969         int x;
1970         int startx = span->startx;
1971         int endx = span->endx;
1972         int d[4];
1973         float a, b;
1974         unsigned char * RESTRICT pixelmask = span->pixelmask;
1975         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1976         if (!pixel)
1977                 return;
1978         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1979         // handle alphatest now (this affects depth writes too)
1980         if (thread->alphatest)
1981                 for (x = startx;x < endx;x++)
1982                         if (in4f[x*4+3] < 0.5f)
1983                                 pixelmask[x] = false;
1984         // FIXME: this does not handle bigendian
1985         switch(thread->fb_blendmode)
1986         {
1987         case DPSOFTRAST_BLENDMODE_OPAQUE:
1988                 for (x = startx;x < endx;x++)
1989                 {
1990                         if (!pixelmask[x])
1991                                 continue;
1992                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1993                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1994                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1995                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1996                         pixel[x*4+0] = d[0];
1997                         pixel[x*4+1] = d[1];
1998                         pixel[x*4+2] = d[2];
1999                         pixel[x*4+3] = d[3];
2000                 }
2001                 break;
2002         case DPSOFTRAST_BLENDMODE_ALPHA:
2003                 for (x = startx;x < endx;x++)
2004                 {
2005                         if (!pixelmask[x])
2006                                 continue;
2007                         a = in4f[x*4+3] * 255.0f;
2008                         b = 1.0f - in4f[x*4+3];
2009                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013                         pixel[x*4+0] = d[0];
2014                         pixel[x*4+1] = d[1];
2015                         pixel[x*4+2] = d[2];
2016                         pixel[x*4+3] = d[3];
2017                 }
2018                 break;
2019         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2020                 for (x = startx;x < endx;x++)
2021                 {
2022                         if (!pixelmask[x])
2023                                 continue;
2024                         a = in4f[x*4+3] * 255.0f;
2025                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029                         pixel[x*4+0] = d[0];
2030                         pixel[x*4+1] = d[1];
2031                         pixel[x*4+2] = d[2];
2032                         pixel[x*4+3] = d[3];
2033                 }
2034                 break;
2035         case DPSOFTRAST_BLENDMODE_ADD:
2036                 for (x = startx;x < endx;x++)
2037                 {
2038                         if (!pixelmask[x])
2039                                 continue;
2040                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044                         pixel[x*4+0] = d[0];
2045                         pixel[x*4+1] = d[1];
2046                         pixel[x*4+2] = d[2];
2047                         pixel[x*4+3] = d[3];
2048                 }
2049                 break;
2050         case DPSOFTRAST_BLENDMODE_INVMOD:
2051                 for (x = startx;x < endx;x++)
2052                 {
2053                         if (!pixelmask[x])
2054                                 continue;
2055                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059                         pixel[x*4+0] = d[0];
2060                         pixel[x*4+1] = d[1];
2061                         pixel[x*4+2] = d[2];
2062                         pixel[x*4+3] = d[3];
2063                 }
2064                 break;
2065         case DPSOFTRAST_BLENDMODE_MUL:
2066                 for (x = startx;x < endx;x++)
2067                 {
2068                         if (!pixelmask[x])
2069                                 continue;
2070                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2071                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2072                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2073                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2074                         pixel[x*4+0] = d[0];
2075                         pixel[x*4+1] = d[1];
2076                         pixel[x*4+2] = d[2];
2077                         pixel[x*4+3] = d[3];
2078                 }
2079                 break;
2080         case DPSOFTRAST_BLENDMODE_MUL2:
2081                 for (x = startx;x < endx;x++)
2082                 {
2083                         if (!pixelmask[x])
2084                                 continue;
2085                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2086                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2087                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2088                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2089                         pixel[x*4+0] = d[0];
2090                         pixel[x*4+1] = d[1];
2091                         pixel[x*4+2] = d[2];
2092                         pixel[x*4+3] = d[3];
2093                 }
2094                 break;
2095         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2096                 for (x = startx;x < endx;x++)
2097                 {
2098                         if (!pixelmask[x])
2099                                 continue;
2100                         a = in4f[x*4+3] * -255.0f;
2101                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2102                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2103                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2104                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2105                         pixel[x*4+0] = d[0];
2106                         pixel[x*4+1] = d[1];
2107                         pixel[x*4+2] = d[2];
2108                         pixel[x*4+3] = d[3];
2109                 }
2110                 break;
2111         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2112                 for (x = startx;x < endx;x++)
2113                 {
2114                         if (!pixelmask[x])
2115                                 continue;
2116                         a = 255.0f;
2117                         b = 1.0f - in4f[x*4+3];
2118                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2119                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2120                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2121                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2122                         pixel[x*4+0] = d[0];
2123                         pixel[x*4+1] = d[1];
2124                         pixel[x*4+2] = d[2];
2125                         pixel[x*4+3] = d[3];
2126                 }
2127                 break;
2128         case DPSOFTRAST_BLENDMODE_INVADD:
2129                 for (x = startx;x < endx;x++)
2130                 {
2131                         if (!pixelmask[x])
2132                                 continue;
2133                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2134                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2135                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2136                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2137                         pixel[x*4+0] = d[0];
2138                         pixel[x*4+1] = d[1];
2139                         pixel[x*4+2] = d[2];
2140                         pixel[x*4+3] = d[3];
2141                 }
2142                 break;
2143         }
2144 }
2145
2146 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2147 {
2148 #ifdef SSE2_PRESENT
2149         int x;
2150         int startx = span->startx;
2151         int endx = span->endx;
2152         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2153         unsigned char * RESTRICT pixelmask = span->pixelmask;
2154         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2155         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2156         if (!pixel)
2157                 return;
2158         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2159         pixeli += span->y * dpsoftrast.fb_width + span->x;
2160         // handle alphatest now (this affects depth writes too)
2161         if (thread->alphatest)
2162                 for (x = startx;x < endx;x++)
2163                         if (in4ub[x*4+3] < 0.5f)
2164                                 pixelmask[x] = false;
2165         // FIXME: this does not handle bigendian
2166         switch(thread->fb_blendmode)
2167         {
2168         case DPSOFTRAST_BLENDMODE_OPAQUE:
2169                 for (x = startx;x + 4 <= endx;)
2170                 {
2171                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2172                         {
2173                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2174                                 x += 4;
2175                         }
2176                         else
2177                         {
2178                                 if (pixelmask[x])
2179                                         pixeli[x] = ini[x];
2180                                 x++;
2181                         }
2182                 }
2183                 for (;x < endx;x++)
2184                         if (pixelmask[x])
2185                                 pixeli[x] = ini[x];
2186                 break;
2187         case DPSOFTRAST_BLENDMODE_ALPHA:
2188         #define FINISHBLEND(blend2, blend1) \
2189                 for (x = startx;x + 1 < endx;x += 2) \
2190                 { \
2191                         __m128i src, dst; \
2192                         switch (*(const unsigned short*)&pixelmask[x]) \
2193                         { \
2194                         case 0x0101: \
2195                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2196                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2197                                 blend2; \
2198                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2199                                 continue; \
2200                         case 0x0100: \
2201                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2202                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2203                                 blend1; \
2204                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2205                                 continue; \
2206                         case 0x0001: \
2207                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2208                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2209                                 blend1; \
2210                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2211                                 continue; \
2212                         } \
2213                         break; \
2214                 } \
2215                 for(;x < endx; x++) \
2216                 { \
2217                         __m128i src, dst; \
2218                         if (!pixelmask[x]) \
2219                                 continue; \
2220                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2222                         blend1; \
2223                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2224                 }
2225
2226                 FINISHBLEND({
2227                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2229                 }, {
2230                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2232                 });
2233                 break;
2234         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2235                 FINISHBLEND({
2236                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2238                 }, {
2239                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2241                 });
2242                 break;
2243         case DPSOFTRAST_BLENDMODE_ADD:
2244                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2245                 break;
2246         case DPSOFTRAST_BLENDMODE_INVMOD:
2247                 FINISHBLEND({
2248                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                 }, {
2250                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2251                 });
2252                 break;
2253         case DPSOFTRAST_BLENDMODE_MUL:
2254                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2255                 break;
2256         case DPSOFTRAST_BLENDMODE_MUL2:
2257                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2258                 break;
2259         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2260                 FINISHBLEND({
2261                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2263                 }, {
2264                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2266                 });
2267                 break;
2268         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2269                 FINISHBLEND({
2270                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2272                 }, {
2273                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2275                 });
2276                 break;
2277         case DPSOFTRAST_BLENDMODE_INVADD:
2278                 FINISHBLEND({
2279                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                 }, {
2281                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2282                 });
2283                 break;
2284         }
2285 #endif
2286 }
2287
2288 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2289 {
2290         int x;
2291         int startx = span->startx;
2292         int endx = span->endx;
2293         int flags;
2294         float c[4];
2295         float data[4];
2296         float slope[4];
2297         float tc[2], endtc[2];
2298         float tcscale[2];
2299         unsigned int tci[2];
2300         unsigned int tci1[2];
2301         unsigned int tcimin[2];
2302         unsigned int tcimax[2];
2303         int tciwrapmask[2];
2304         int tciwidth;
2305         int filter;
2306         int mip;
2307         const unsigned char * RESTRICT pixelbase;
2308         const unsigned char * RESTRICT pixel[4];
2309         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2310         // if no texture is bound, just fill it with white
2311         if (!texture)
2312         {
2313                 for (x = startx;x < endx;x++)
2314                 {
2315                         out4f[x*4+0] = 1.0f;
2316                         out4f[x*4+1] = 1.0f;
2317                         out4f[x*4+2] = 1.0f;
2318                         out4f[x*4+3] = 1.0f;
2319                 }
2320                 return;
2321         }
2322         mip = triangle->mip[texunitindex];
2323         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2324         // if this mipmap of the texture is 1 pixel, just fill it with that color
2325         if (texture->mipmap[mip][1] == 4)
2326         {
2327                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2328                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2329                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2330                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2331                 for (x = startx;x < endx;x++)
2332                 {
2333                         out4f[x*4+0] = c[0];
2334                         out4f[x*4+1] = c[1];
2335                         out4f[x*4+2] = c[2];
2336                         out4f[x*4+3] = c[3];
2337                 }
2338                 return;
2339         }
2340         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2341         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2342         flags = texture->flags;
2343         tcscale[0] = texture->mipmap[mip][2];
2344         tcscale[1] = texture->mipmap[mip][3];
2345         tciwidth = texture->mipmap[mip][2];
2346         tcimin[0] = 0;
2347         tcimin[1] = 0;
2348         tcimax[0] = texture->mipmap[mip][2]-1;
2349         tcimax[1] = texture->mipmap[mip][3]-1;
2350         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2351         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2352         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2353         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2354         for (x = startx;x < endx;)
2355         {
2356                 unsigned int subtc[2];
2357                 unsigned int substep[2];
2358                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2359                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2360                 if (nextsub >= endx)
2361                 {
2362                         nextsub = endsub = endx-1;      
2363                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2364                 }
2365                 tc[0] = endtc[0];
2366                 tc[1] = endtc[1];
2367                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2368                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2369                 substep[0] = (endtc[0] - tc[0]) * subscale;
2370                 substep[1] = (endtc[1] - tc[1]) * subscale;
2371                 subtc[0] = tc[0] * (1<<16);
2372                 subtc[1] = tc[1] * (1<<16);
2373                 if (filter)
2374                 {
2375                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2376                         {
2377                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2378                                 {
2379                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2380                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2381                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2382                                         tci[0] = subtc[0]>>16;
2383                                         tci[1] = subtc[1]>>16;
2384                                         tci1[0] = tci[0] + 1;
2385                                         tci1[1] = tci[1] + 1;
2386                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2387                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2388                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2389                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2390                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2391                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2392                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2393                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2394                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2395                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2396                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2397                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2398                                         out4f[x*4+0] = c[0];
2399                                         out4f[x*4+1] = c[1];
2400                                         out4f[x*4+2] = c[2];
2401                                         out4f[x*4+3] = c[3];
2402                                 }
2403                         }
2404                         else
2405                         {
2406                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2407                                 {
2408                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2409                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2410                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2411                                         tci[0] = subtc[0]>>16;
2412                                         tci[1] = subtc[1]>>16;
2413                                         tci1[0] = tci[0] + 1;
2414                                         tci1[1] = tci[1] + 1;
2415                                         tci[0] &= tciwrapmask[0];
2416                                         tci[1] &= tciwrapmask[1];
2417                                         tci1[0] &= tciwrapmask[0];
2418                                         tci1[1] &= tciwrapmask[1];
2419                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2420                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2421                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2422                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2423                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2424                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2425                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2426                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2427                                         out4f[x*4+0] = c[0];
2428                                         out4f[x*4+1] = c[1];
2429                                         out4f[x*4+2] = c[2];
2430                                         out4f[x*4+3] = c[3];
2431                                 }
2432                         }
2433                 }
2434                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2435                 {
2436                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2437                         {
2438                                 tci[0] = subtc[0]>>16;
2439                                 tci[1] = subtc[1]>>16;
2440                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2441                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2442                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2443                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2444                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2445                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2446                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2447                                 out4f[x*4+0] = c[0];
2448                                 out4f[x*4+1] = c[1];
2449                                 out4f[x*4+2] = c[2];
2450                                 out4f[x*4+3] = c[3];
2451                         }
2452                 }
2453                 else
2454                 {
2455                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2456                         {
2457                                 tci[0] = subtc[0]>>16;
2458                                 tci[1] = subtc[1]>>16;
2459                                 tci[0] &= tciwrapmask[0];
2460                                 tci[1] &= tciwrapmask[1];
2461                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2462                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2463                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2464                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2465                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2466                                 out4f[x*4+0] = c[0];
2467                                 out4f[x*4+1] = c[1];
2468                                 out4f[x*4+2] = c[2];
2469                                 out4f[x*4+3] = c[3];
2470                         }
2471                 }
2472         }
2473 }
2474
2475 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2476 {
2477 #ifdef SSE2_PRESENT
2478         int x;
2479         int startx = span->startx;
2480         int endx = span->endx;
2481         int flags;
2482         __m128 data, slope, tcscale;
2483         __m128i tcsize, tcmask, tcoffset, tcmax;
2484         __m128 tc, endtc;
2485         __m128i subtc, substep, endsubtc;
2486         int filter;
2487         int mip;
2488         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2489         const unsigned char * RESTRICT pixelbase;
2490         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2491         // if no texture is bound, just fill it with white
2492         if (!texture)
2493         {
2494                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2495                 return;
2496         }
2497         mip = triangle->mip[texunitindex];
2498         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2499         // if this mipmap of the texture is 1 pixel, just fill it with that color
2500         if (texture->mipmap[mip][1] == 4)
2501         {
2502                 unsigned int k = *((const unsigned int *)pixelbase);
2503                 for (x = startx;x < endx;x++)
2504                         outi[x] = k;
2505                 return;
2506         }
2507         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2508         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2509         flags = texture->flags;
2510         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2511         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2512         tcscale = _mm_cvtepi32_ps(tcsize);
2513         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2514         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2515         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2516         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2517         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2518         tcmax = _mm_packs_epi32(tcmask, tcmask);
2519         for (x = startx;x < endx;)
2520         {
2521                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2522                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2523                 if (nextsub >= endx)
2524                 {
2525                         nextsub = endsub = endx-1;
2526                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2527                 }       
2528                 tc = endtc;
2529                 subtc = endsubtc;
2530                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2531                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2532                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2533                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2534                 substep = _mm_slli_epi32(substep, 1);
2535                 if (filter)
2536                 {
2537                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2538                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2539                         {
2540                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2541                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2542                                 {
2543                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2544                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2545                                         tci = _mm_madd_epi16(tci, tcoffset);
2546                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2547                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2548                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2549                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2550                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2551                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2552                                         fracm = _mm_srli_epi16(subtc, 1);
2553                                         pix1 = _mm_add_epi16(pix1,
2554                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2555                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2556                                         pix3 = _mm_add_epi16(pix3,
2557                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2558                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2559                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2560                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2561                                         pix2 = _mm_add_epi16(pix2,
2562                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2563                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2564                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2565                                 }
2566                                 if (x <= endsub)
2567                                 {
2568                                         const unsigned char * RESTRICT ptr1;
2569                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2570                                         tci = _mm_madd_epi16(tci, tcoffset);
2571                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2572                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2573                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2574                                         fracm = _mm_srli_epi16(subtc, 1);
2575                                         pix1 = _mm_add_epi16(pix1,
2576                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2577                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2578                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2579                                         pix1 = _mm_add_epi16(pix1,
2580                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2581                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2582                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2583                                         x++;
2584                                 }
2585                         }
2586                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2587                         {
2588                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2589                                 {
2590                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2591                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2592                                         tci = _mm_madd_epi16(tci, tcoffset);
2593                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2594                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2595                                                                                         _mm_setzero_si128());
2596                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2597                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2598                                                                                         _mm_setzero_si128());
2599                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2600                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2601                                         tci = _mm_madd_epi16(tci, tcoffset);
2602                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2603                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2604                                                                                         _mm_setzero_si128());
2605                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2606                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2607                                                                                         _mm_setzero_si128());
2608                                         fracm = _mm_srli_epi16(subtc, 1);
2609                                         pix1 = _mm_add_epi16(pix1,
2610                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2611                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2612                                         pix3 = _mm_add_epi16(pix3,
2613                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2614                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2615                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2616                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2617                                         pix2 = _mm_add_epi16(pix2,
2618                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2619                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2620                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2621                                 }
2622                                 if (x <= endsub)
2623                                 {
2624                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2625                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2626                                         tci = _mm_madd_epi16(tci, tcoffset);
2627                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2628                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2629                                                                                         _mm_setzero_si128());
2630                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2631                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2632                                                                                         _mm_setzero_si128());
2633                                         fracm = _mm_srli_epi16(subtc, 1);
2634                                         pix1 = _mm_add_epi16(pix1,
2635                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2636                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2637                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2638                                         pix1 = _mm_add_epi16(pix1,
2639                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2640                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2641                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2642                                         x++;
2643                                 }
2644                         }
2645                         else
2646                         {
2647                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2648                                 {
2649                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2650                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2651                                         tci = _mm_madd_epi16(tci, tcoffset);
2652                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2653                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2654                                                                                         _mm_setzero_si128());
2655                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2656                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2657                                                                                         _mm_setzero_si128());
2658                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2659                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2660                                         tci = _mm_madd_epi16(tci, tcoffset);
2661                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2662                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2663                                                                                         _mm_setzero_si128());
2664                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2665                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2666                                                                                         _mm_setzero_si128());
2667                                         fracm = _mm_srli_epi16(subtc, 1);
2668                                         pix1 = _mm_add_epi16(pix1,
2669                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2670                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2671                                         pix3 = _mm_add_epi16(pix3,
2672                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2673                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2674                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2675                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2676                                         pix2 = _mm_add_epi16(pix2,
2677                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2678                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2679                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2680                                 }
2681                                 if (x <= endsub)
2682                                 {
2683                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2684                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2685                                         tci = _mm_madd_epi16(tci, tcoffset);
2686                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2687                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2688                                                                                         _mm_setzero_si128());
2689                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2690                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2691                                                                                         _mm_setzero_si128());
2692                                         fracm = _mm_srli_epi16(subtc, 1);
2693                                         pix1 = _mm_add_epi16(pix1,
2694                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2695                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2696                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2697                                         pix1 = _mm_add_epi16(pix1,
2698                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2699                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2700                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2701                                         x++;
2702                                 }
2703                         }
2704                 }
2705                 else
2706                 {
2707                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2708                         {
2709                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2710                                 {
2711                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2712                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2713                                         tci = _mm_madd_epi16(tci, tcoffset);
2714                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2715                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2716                                 }
2717                                 if (x <= endsub)
2718                                 {
2719                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2720                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2721                                         tci = _mm_madd_epi16(tci, tcoffset);
2722                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2723                                         x++;
2724                                 }
2725                         }
2726                         else
2727                         {
2728                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2729                                 {
2730                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2731                                         tci = _mm_and_si128(tci, tcmax); 
2732                                         tci = _mm_madd_epi16(tci, tcoffset);
2733                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2734                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2735                                 }
2736                                 if (x <= endsub)
2737                                 {
2738                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2739                                         tci = _mm_and_si128(tci, tcmax); 
2740                                         tci = _mm_madd_epi16(tci, tcoffset);
2741                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2742                                         x++;
2743                                 }
2744                         }
2745                 }
2746         }
2747 #endif
2748 }
2749
2750 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2751 {
2752         // TODO: IMPLEMENT
2753         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2754 }
2755
// Shadowmap sampling stub: the coordinates in `vector` are ignored and the
// constant 1.0f is returned for every input.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2761
2762 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2763 {
2764         int x;
2765         int startx = span->startx;
2766         int endx = span->endx;
2767         float c[4];
2768         float data[4];
2769         float slope[4];
2770         float z;
2771         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2772         for (x = startx;x < endx;x++)
2773         {
2774                 z = zf[x];
2775                 c[0] = (data[0] + slope[0]*x) * z;
2776                 c[1] = (data[1] + slope[1]*x) * z;
2777                 c[2] = (data[2] + slope[2]*x) * z;
2778                 c[3] = (data[3] + slope[3]*x) * z;
2779                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2780                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2781                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2782                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2783         }
2784 }
2785
2786 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2787 {
2788         int x;
2789         int startx = span->startx;
2790         int endx = span->endx;
2791         float c[4];
2792         float data[4];
2793         float slope[4];
2794         float z;
2795         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2796         for (x = startx;x < endx;x++)
2797         {
2798                 z = zf[x];
2799                 c[0] = (data[0] + slope[0]*x) * z;
2800                 c[1] = (data[1] + slope[1]*x) * z;
2801                 c[2] = (data[2] + slope[2]*x) * z;
2802                 c[3] = (data[3] + slope[3]*x) * z;
2803                 out4f[x*4+0] = c[0];
2804                 out4f[x*4+1] = c[1];
2805                 out4f[x*4+2] = c[2];
2806                 out4f[x*4+3] = c[3];
2807         }
2808 }
2809
2810 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2811 {
2812         int x, startx = span->startx, endx = span->endx;
2813         float c[4], localcolor[4];
2814         localcolor[0] = subcolor[0];
2815         localcolor[1] = subcolor[1];
2816         localcolor[2] = subcolor[2];
2817         localcolor[3] = subcolor[3];
2818         for (x = startx;x < endx;x++)
2819         {
2820                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2821                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2822                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2823                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2824                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2825                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2826                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2827                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2828         }
2829 }
2830
2831 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2832 {
2833         int x, startx = span->startx, endx = span->endx;
2834         for (x = startx;x < endx;x++)
2835         {
2836                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2837                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2838                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2839                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2840         }
2841 }
2842
2843 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2844 {
2845         int x, startx = span->startx, endx = span->endx;
2846         for (x = startx;x < endx;x++)
2847         {
2848                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2849                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2850                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2851                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2852         }
2853 }
2854
2855 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2856 {
2857         int x, startx = span->startx, endx = span->endx;
2858         float a, b;
2859         for (x = startx;x < endx;x++)
2860         {
2861                 a = 1.0f - inb4f[x*4+3];
2862                 b = inb4f[x*4+3];
2863                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2864                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2865                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2866                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2867         }
2868 }
2869
2870 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2871 {
2872         int x, startx = span->startx, endx = span->endx;
2873         float localcolor[4], ilerp, lerp;
2874         localcolor[0] = color[0];
2875         localcolor[1] = color[1];
2876         localcolor[2] = color[2];
2877         localcolor[3] = color[3];
2878         ilerp = 1.0f - localcolor[3];
2879         lerp = localcolor[3];
2880         for (x = startx;x < endx;x++)
2881         {
2882                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2883                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2884                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2885                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2886         }
2887 }
2888
2889
2890
2891 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2892 {
2893 #ifdef SSE2_PRESENT
2894         int x;
2895         int startx = span->startx;
2896         int endx = span->endx;
2897         __m128 data, slope;
2898         __m128 mod, endmod;
2899         __m128i submod, substep, endsubmod;
2900         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2901         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2902         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2903         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2904         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2905         for (x = startx; x < endx;)
2906         {
2907                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2908                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2909                 if (nextsub >= endx)
2910                 {
2911                         nextsub = endsub = endx-1;
2912                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2913                 }
2914                 mod = endmod;
2915                 submod = endsubmod;
2916                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2917                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2918                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2919                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2920                 substep = _mm_packs_epi32(substep, substep);
2921                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2922                 {
2923                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2924                         pix = _mm_mulhi_epu16(pix, submod);
2925                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2926                 }
2927                 if (x <= endsub)
2928                 {
2929                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2930                         pix = _mm_mulhi_epu16(pix, submod);
2931                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2932                         x++;
2933                 }
2934         }
2935 #endif
2936 }
2937
2938 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2939 {
2940 #ifdef SSE2_PRESENT
2941         int x;
2942         int startx = span->startx;
2943         int endx = span->endx;
2944         __m128 data, slope;
2945         __m128 mod, endmod;
2946         __m128i submod, substep, endsubmod;
2947         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2948         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2949         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2950         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2951         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2952         for (x = startx; x < endx;)
2953         {
2954                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2955                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2956                 if (nextsub >= endx)
2957                 {
2958                         nextsub = endsub = endx-1;
2959                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2960                 }
2961                 mod = endmod;
2962                 submod = endsubmod;
2963                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2964                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2965                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2966                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2967                 substep = _mm_packs_epi32(substep, substep);
2968                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2969                 {
2970                         __m128i pix = _mm_srai_epi16(submod, 4);
2971                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2972                 }
2973                 if (x <= endsub)
2974                 {
2975                         __m128i pix = _mm_srai_epi16(submod, 4);
2976                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2977                         x++;
2978                 }
2979         }
2980 #endif
2981 }
2982
2983 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2984 {
2985 #ifdef SSE2_PRESENT
2986         int x, startx = span->startx, endx = span->endx;
2987         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2988         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2989         for (x = startx;x+2 <= endx;x+=2)
2990         {
2991                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2992                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2993                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2994                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2995         }
2996         if (x < endx)
2997         {
2998                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2999                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3000                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3001                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3002         }
3003 #endif
3004 }
3005
3006 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3007 {
3008 #ifdef SSE2_PRESENT
3009         int x, startx = span->startx, endx = span->endx;
3010         for (x = startx;x+2 <= endx;x+=2)
3011         {
3012                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3013                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3014                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3015                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3016         }
3017         if (x < endx)
3018         {
3019                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3020                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3021                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3022                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3023         }
3024 #endif
3025 }
3026
3027 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3028 {
3029 #ifdef SSE2_PRESENT
3030         int x, startx = span->startx, endx = span->endx;
3031         for (x = startx;x+2 <= endx;x+=2)
3032         {
3033                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3034                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3035                 pix1 = _mm_add_epi16(pix1, pix2);
3036                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3037         }
3038         if (x < endx)
3039         {
3040                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3041                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3042                 pix1 = _mm_add_epi16(pix1, pix2);
3043                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3044         }
3045 #endif
3046 }
3047
3048 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3049 {
3050 #ifdef SSE2_PRESENT
3051         int x, startx = span->startx, endx = span->endx;
3052         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3053         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3054         for (x = startx;x+2 <= endx;x+=2)
3055         {
3056                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3057                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3058                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3059                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3060         }
3061         if (x < endx)
3062         {
3063                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3064                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3065                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3066                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3067         }
3068 #endif
3069 }
3070
3071 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3072 {
3073 #ifdef SSE2_PRESENT
3074         int x, startx = span->startx, endx = span->endx;
3075         for (x = startx;x+2 <= endx;x+=2)
3076         {
3077                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3078                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3079                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3080                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3081                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3082         }
3083         if (x < endx)
3084         {
3085                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3086                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3087                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3088                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3090         }
3091 #endif
3092 }
3093
3094 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3095 {
3096 #ifdef SSE2_PRESENT
3097         int x, startx = span->startx, endx = span->endx;
3098         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3099         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3100         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3101         for (x = startx;x+2 <= endx;x+=2)
3102         {
3103                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3104                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3105                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3106         }
3107         if (x < endx)
3108         {
3109                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3110                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3111                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3112         }
3113 #endif
3114 }
3115
3116
3117
3118 void DPSOFTRAST_VertexShader_Generic(void)
3119 {
3120         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3121         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3122         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3123         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3124                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3125 }
3126
3127 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3128 {
3129         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3130         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3132         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3133         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3134         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3135         {
3136                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3137                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3138                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3139                 {
3140                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3141                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3142                         {
3143                                 // multiply
3144                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3145                         }
3146                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3147                         {
3148                                 // add
3149                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3150                         }
3151                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3152                         {
3153                                 // alphablend
3154                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3155                         }
3156                 }
3157         }
3158         else
3159                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3160         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3161 }
3162
3163
3164
3165 void DPSOFTRAST_VertexShader_PostProcess(void)
3166 {
3167         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3168         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3169         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3170 }
3171
3172 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3173 {
3174         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3175         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3176         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3177         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3178         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3179         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3180         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3181         {
3182                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3183                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3184         }
3185         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3186         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3187         {
3188                 // TODO: implement saturation
3189         }
3190         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3191         {
3192                 // TODO: implement gammaramps
3193         }
3194         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3195 }
3196
3197
3198
3199 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3200 {
3201         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3202 }
3203
3204 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3205 {
3206         // this is never called (because colormask is off when this shader is used)
3207         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3208         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3209         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3210         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3211         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3212 }
3213
3214
3215
3216 void DPSOFTRAST_VertexShader_FlatColor(void)
3217 {
3218         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3219         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3220 }
3221
3222 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3223 {
3224 #ifdef SSE2_PRESENT
3225         unsigned char * RESTRICT pixelmask = span->pixelmask;
3226         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3227         int x, startx = span->startx, endx = span->endx;
3228         __m128i Color_Ambientm;
3229         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3230         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3231         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3232         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3233         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3234         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3235                 pixel = buffer_FragColorbgra8;
3236         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3237         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3238         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3239         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3240         for (x = startx;x < endx;x++)
3241         {
3242                 __m128i color, pix;
3243                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3244                 {
3245                         __m128i pix2;
3246                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3247                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3248                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3249                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3250                         x += 3;
3251                         continue;
3252                 }
3253                 if (!pixelmask[x])
3254                         continue;
3255                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3256                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3257                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3258         }
3259         if (pixel == buffer_FragColorbgra8)
3260                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3261 #endif
3262 }
3263
3264
3265
3266 void DPSOFTRAST_VertexShader_VertexColor(void)
3267 {
3268         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3269         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3270         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3271 }
3272
3273 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3274 {
3275 #ifdef SSE2_PRESENT
3276         unsigned char * RESTRICT pixelmask = span->pixelmask;
3277         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3278         int x, startx = span->startx, endx = span->endx;
3279         __m128i Color_Ambientm, Color_Diffusem;
3280         __m128 data, slope;
3281         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3282         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3283         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3285         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3286         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3287         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3288                 pixel = buffer_FragColorbgra8;
3289         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3290         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3291         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3292         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3293         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3294         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3295         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3296         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3297         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3298         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3299         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3300         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3301         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3302         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3303         {
3304                 __m128i color, mod, pix;
3305                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3306                 {
3307                         __m128i pix2, mod2;
3308                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3309                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3310                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3311                         data = _mm_add_ps(data, slope);
3312                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3313                         data = _mm_add_ps(data, slope);
3314                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3315                         data = _mm_add_ps(data, slope);
3316                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3317                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3318                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3319                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3320                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3321                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3322                         x += 3;
3323                         continue;
3324                 }
3325                 if (!pixelmask[x])
3326                         continue;
3327                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3328                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3329                 mod = _mm_packs_epi32(mod, mod);
3330                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3331                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3332         }
3333         if (pixel == buffer_FragColorbgra8)
3334                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3335 #endif
3336 }
3337
3338
3339
3340 void DPSOFTRAST_VertexShader_Lightmap(void)
3341 {
3342         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3343         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3344         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3345 }
3346
3347 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3348 {
3349 #ifdef SSE2_PRESENT
3350         unsigned char * RESTRICT pixelmask = span->pixelmask;
3351         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3352         int x, startx = span->startx, endx = span->endx;
3353         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3354         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3355         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3358         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3359         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3360         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3361         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3362         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3363                 pixel = buffer_FragColorbgra8;
3364         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3365         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3366         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3367         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3368         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3369         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3370         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3371         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3372         {
3373                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3374                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3375                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3376                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3377                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3378                 for (x = startx;x < endx;x++)
3379                 {
3380                         __m128i color, lightmap, glow, pix;
3381                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3382                         {
3383                                 __m128i pix2;
3384                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3385                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3386                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3387                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3388                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3389                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3390                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3391                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3392                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3393                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3394                                 x += 3;
3395                                 continue;
3396                         }
3397                         if (!pixelmask[x])
3398                                 continue;
3399                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3400