fix an unused variable
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// when compiled as C (not C++), make 'bool' an alias for qboolean
// (qboolean is not declared in this file; presumably it comes from quakedef.h)
#ifndef __cplusplus
typedef qboolean bool;
#endif
12
// ALIGN_SIZE is the byte alignment applied by ALIGN(); ATOMIC_SIZE is the
// byte alignment applied by ATOMIC()
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// compiler-specific alignment, barrier and atomic counter primitives
// (only selected when SSE2_PRESENT is defined; anything left undefined here
// gets a plain, non-atomic fallback below)
#ifdef SSE2_PRESENT
        #if defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(ALIGN_SIZE)))
                #define ATOMIC(var) var __attribute__((__aligned__(ATOMIC_SIZE)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(ALIGN_SIZE)) var
                #define ATOMIC(var) __declspec(align(ATOMIC_SIZE)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                // cast to void so ATOMIC_ADD yields no value on any platform
                // (the GCC and fallback variants are void expressions as well)
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif

// fallbacks: no alignment, no barrier, ordinary (non-atomic) arithmetic
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
// ATOMIC_INCREMENT/ATOMIC_DECREMENT evaluate to the counter's new value
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
// ATOMIC_ADD yields no value
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
59
#ifdef SSE2_PRESENT
#include <emmintrin.h>

// allocate size bytes aligned to ATOMIC_SIZE; release with MM_FREE
#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc() counterpart built on _mm_malloc: returns nmemb*size zeroed bytes
// aligned to ATOMIC_SIZE, or NULL if the allocation fails or nmemb*size does
// not fit in a size_t (the calloc() used by the non-SSE2 path rejects that too)
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // (size_t)-1 is the largest size_t value; avoids needing SIZE_MAX
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
// no SSE2: use the plain C allocator (no extra alignment)
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
78
// slot numbers for the per-vertex arrays; used as the first index of arrays
// sized by DPSOFTRAST_ARRAY_TOTAL (e.g. the triangle attribs[] table)
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION  = 0,
        DPSOFTRAST_ARRAY_COLOR     = 1,
        DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
        DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
        DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
        DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
        DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
        DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
        DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
        DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
        // number of slots above, not a slot itself
        DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
94
95 typedef struct DPSOFTRAST_Texture_s
96 {
97         int flags;
98         int width;
99         int height;
100         int depth;
101         int sides;
102         DPSOFTRAST_TEXTURE_FILTER filter;
103         int mipmaps;
104         int size;
105         ATOMIC_COUNTER binds;
106         unsigned char *bytes;
107         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
108 }
109 DPSOFTRAST_Texture;
110
// commands use the same alignment as ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// common header of every command; the structs generated by DEFCOMMAND below
// begin with these same two members
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
}
DPSOFTRAST_Command);
120
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares two things:
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the aligned struct type DPSOFTRAST_Command_<name>, which starts with the
//     opcode/commandsize header and is followed by the given fields
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );

// size in bytes of the command pool buffer (2 MiB)
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL (2 * 1024 * 1024)
// 16 KiB; NOTE(review): presumably the largest size of a single command - confirm against the allocator
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE (16 * 1024)
134
// byte buffer holding queued commands plus its bookkeeping
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        // copied into dpsoftrast.drawcommand by DPSOFTRAST_Draw_SyncCommands
        int freecommand;
        // compared against DPSOFTRAST_DRAW_MAXCOMMANDPOOL when space is requested
        int usedcommands;
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
142
// per-triangle data consumed by the DPSOFTRAST_CALCATTRIB macros
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // for each array slot: [0] = change per unit x, [1] = change per unit y,
        // [2] = base value (this is how DPSOFTRAST_CALCATTRIB combines them)
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
150
// SSE version: loads the x slope of attribute arrayindex into slope and sets
// data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// (triangle and span are pointers; data and slope are __m128 lvalues)
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// scalar version of the above: data and slope are float[4] (or float pointers)
// every macro argument is parenthesized so that expression arguments such as
// &span or ptr+1 expand correctly
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
167                                         
// NOTE(review): not referenced in the visible part of this file - meaning to be confirmed at its users
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels queued for drawing; x and y are also the
// coordinates the DPSOFTRAST_CALCATTRIB macros evaluate attributes at
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
180
// capacities of the per-thread spans[] and triangles[] arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of the per-thread 'validate' field; a set bit means the matching
// derived values still have to be recomputed by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB        (1 << 0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1 << 1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1 << 2)
// all of the above
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
188
// blend modes selected by DPSOFTRAST_RecalcBlendFunc from the blend factor pair
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
        DPSOFTRAST_BLENDMODE_ALPHA       = 1,
        DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
        DPSOFTRAST_BLENDMODE_ADD         = 3,
        DPSOFTRAST_BLENDMODE_INVMOD      = 4,
        DPSOFTRAST_BLENDMODE_MUL         = 5,
        DPSOFTRAST_BLENDMODE_MUL2        = 6,
        DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
        DPSOFTRAST_BLENDMODE_INVADD      = 9,
        // number of modes above, not a mode itself
        DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
204
// state owned by one entry of dpsoftrast.threads
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index;
        
        // raw render state; the fb_* members further down are derived from it
        int cullface;
        int colormask[4];
        int blendfunc[2];
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4];
        int scissor[4];
        float depthrange[2];
        float polygonoffset[2];

        int shader_mode;
        int shader_permutation;

        // point into the dpsoftrast.texture array (re-pointed by DPSOFTRAST_Texture_Grow)
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        // x, y, width, height as computed by DPSOFTRAST_RecalcFB
        int fb_scissor[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc;

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode;

        // band boundaries
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        // number of used entries in spans[] and triangles[]
        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
}
DPSOFTRAST_State_Thread);
269
// global rasterizer state; the single instance is 'dpsoftrast' below
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        // render targets, as set by DPSOFTRAST_SetRenderTargets
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        // point into the texture array below (re-pointed by DPSOFTRAST_Texture_Grow)
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;

        // texture array bookkeeping:
        // texture_max is the allocated entry count, texture_end is one past the
        // highest slot in use, texture_firstfree is where the search for an
        // unused slot starts
        int texture_max;
        int texture_end;
        int texture_firstfree;
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace;
        // number of entries in threads[]
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        // set from commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

DPSOFTRAST_State dpsoftrast;
328
// scale applied when converting a float depth to an integer depth (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels (nominally 0..1) into one integer, rounding each
// to 8 bits: alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7
// arguments are parenthesized so expressions such as (c + d) convert correctly
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// maps float depth d to DPSOFTRAST_DEPTHSCALE * (1 - d), truncated to int
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
334
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
336 {
337         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339         fb_viewportcenter[3] = 0.5f;
340         fb_viewportcenter[0] = 0.0f;
341         fb_viewportscale[1] = 0.5f * viewport[2];
342         fb_viewportscale[2] = -0.5f * viewport[3];
343         fb_viewportscale[3] = 0.5f;
344         fb_viewportscale[0] = 1.0f;
345 }
346
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
348 {
349         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350         // and viewport projection values
351         int x1, x2;
352         int y1, y2;
353         x1 = thread->scissor[0];
354         x2 = thread->scissor[0] + thread->scissor[2];
355         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356         y2 = dpsoftrast.fb_height - thread->scissor[1];
357         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
358         if (x1 < 0) x1 = 0;
359         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
360         if (y1 < 0) y1 = 0;
361         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
362         thread->fb_scissor[0] = x1;
363         thread->fb_scissor[1] = y1;
364         thread->fb_scissor[2] = x2 - x1;
365         thread->fb_scissor[3] = y2 - y1;
366
367         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
368 }
369
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
371 {
372         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
373 }
374
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
376 {
377         if (thread->blendsubtract)
378         {
379                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
380                 {
381                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
383                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
385                 }
386         }
387         else
388         {       
389                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390                 {
391                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
396                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
402                 }
403         }
404 }
405
406 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
407
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
409 {
410         mask &= thread->validate;
411         if (!mask)
412                 return;
413         if (mask & DPSOFTRAST_VALIDATE_FB)
414         {
415                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416                 DPSOFTRAST_RecalcFB(thread);
417         }
418         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
419         {
420                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421                 DPSOFTRAST_RecalcDepthFunc(thread);
422         }
423         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
424         {
425                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426                 DPSOFTRAST_RecalcBlendFunc(thread);
427         }
428 }
429
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
431 {
432         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433                 return &dpsoftrast.texture[index];
434         return NULL;
435 }
436
437 static void DPSOFTRAST_Texture_Grow(void)
438 {
439         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440         DPSOFTRAST_State_Thread *thread;
441         int i;
442         int j;
443         DPSOFTRAST_Flush();
444         // expand texture array as needed
445         if (dpsoftrast.texture_max < 1024)
446                 dpsoftrast.texture_max = 1024;
447         else
448                 dpsoftrast.texture_max *= 2;
449         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
450         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451                 if (dpsoftrast.texbound[i])
452                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
453         for (j = 0; j < dpsoftrast.numthreads; j++)
454         {
455                 thread = &dpsoftrast.threads[j];
456                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457                         if (thread->texbound[i])
458                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
459         }
460 }
461
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
463 {
464         int w;
465         int h;
466         int d;
467         int size;
468         int s;
469         int texnum;
470         int mipmaps;
471         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473         DPSOFTRAST_Texture *texture;
474         if (width*height*depth < 1)
475         {
476                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
477                 return 0;
478         }
479         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
480         {
481                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
482                 return 0;
483         }
484         switch(texformat)
485         {
486         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
489                 break;
490         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
492                 {
493                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
494                         return 0;
495                 }
496                 if (depth != 1)
497                 {
498                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
499                         return 0;
500                 }
501                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
502                 {
503                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
504                         return 0;
505                 }
506                 break;
507         }
508         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
509         {
510                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
511                 return 0;
512         }
513         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
514         {
515                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
516                 return 0;
517         }
518         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
519         {
520                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
521                 return 0;
522         }
523         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524         {
525                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
526                 return 0;
527         }
528         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
529         {
530                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
531                 return 0;
532         }
533         // find first empty slot in texture array
534         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535                 if (!dpsoftrast.texture[texnum].bytes)
536                         break;
537         dpsoftrast.texture_firstfree = texnum + 1;
538         if (dpsoftrast.texture_max <= texnum)
539                 DPSOFTRAST_Texture_Grow();
540         if (dpsoftrast.texture_end <= texnum)
541                 dpsoftrast.texture_end = texnum + 1;
542         texture = &dpsoftrast.texture[texnum];
543         memset(texture, 0, sizeof(*texture));
544         texture->flags = flags;
545         texture->width = width;
546         texture->height = height;
547         texture->depth = depth;
548         texture->sides = sides;
549         texture->binds = 0;
550         w = width;
551         h = height;
552         d = depth;
553         size = 0;
554         mipmaps = 0;
555         w = width;
556         h = height;
557         d = depth;
558         for (;;)
559         {
560                 s = w * h * d * sides * 4;
561                 texture->mipmap[mipmaps][0] = size;
562                 texture->mipmap[mipmaps][1] = s;
563                 texture->mipmap[mipmaps][2] = w;
564                 texture->mipmap[mipmaps][3] = h;
565                 texture->mipmap[mipmaps][4] = d;
566                 size += s;
567                 mipmaps++;
568                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569                         break;
570                 if (w > 1) w >>= 1;
571                 if (h > 1) h >>= 1;
572                 if (d > 1) d >>= 1;
573         }
574         texture->mipmaps = mipmaps;
575         texture->size = size;
576
577         // allocate the pixels now
578         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
579
580         return texnum;
581 }
582 void DPSOFTRAST_Texture_Free(int index)
583 {
584         DPSOFTRAST_Texture *texture;
585         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
586         if (texture->binds)
587                 DPSOFTRAST_Flush();
588         if (texture->bytes)
589                 MM_FREE(texture->bytes);
590         texture->bytes = NULL;
591         memset(texture, 0, sizeof(*texture));
592         // adjust the free range and used range
593         if (dpsoftrast.texture_firstfree > index)
594                 dpsoftrast.texture_firstfree = index;
595         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596                 dpsoftrast.texture_end--;
597 }
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
599 {
600         int i, x, y, z, w, layer0, layer1, row0, row1;
601         unsigned char *o, *i0, *i1, *i2, *i3;
602         DPSOFTRAST_Texture *texture;
603         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604         if (texture->mipmaps <= 1)
605                 return;
606         for (i = 1;i < texture->mipmaps;i++)
607         {
608                 for (z = 0;z < texture->mipmap[i][4];z++)
609                 {
610                         layer0 = z*2;
611                         layer1 = z*2+1;
612                         if (layer1 >= texture->mipmap[i-1][4])
613                                 layer1 = texture->mipmap[i-1][4]-1;
614                         for (y = 0;y < texture->mipmap[i][3];y++)
615                         {
616                                 row0 = y*2;
617                                 row1 = y*2+1;
618                                 if (row1 >= texture->mipmap[i-1][3])
619                                         row1 = texture->mipmap[i-1][3]-1;
620                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
621                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625                                 w = texture->mipmap[i][2];
626                                 if (layer1 > layer0)
627                                 {
628                                         if (texture->mipmap[i-1][2] > 1)
629                                         {
630                                                 // average 3D texture
631                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
632                                                 {
633                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
637                                                 }
638                                         }
639                                         else
640                                         {
641                                                 // average 3D mipmap with parent width == 1
642                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
643                                                 {
644                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
648                                                 }
649                                         }
650                                 }
651                                 else
652                                 {
653                                         if (texture->mipmap[i-1][2] > 1)
654                                         {
655                                                 // average 2D texture (common case)
656                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
657                                                 {
658                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
662                                                 }
663                                         }
664                                         else
665                                         {
666                                                 // 2D texture with parent width == 1
667                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
668                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
669                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
670                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
671                                         }
672                                 }
673                         }
674                 }
675         }
676 }
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
678 {
679         DPSOFTRAST_Texture *texture;
680         unsigned char *dst;
681         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
682         if (texture->binds)
683                 DPSOFTRAST_Flush();
684         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685         while (blockheight > 0)
686         {
687                 memcpy(dst, pixels, blockwidth * 4);
688                 pixels += blockwidth * 4;
689                 dst += texture->mipmap[0][2] * 4;
690                 blockheight--;
691         }
692         DPSOFTRAST_Texture_CalculateMipmaps(index);
693 }
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
695 {
696         DPSOFTRAST_Texture *texture;
697         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
698         if (texture->binds)
699                 DPSOFTRAST_Flush();
700         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707         return texture->mipmap[mip][2];
708 }
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
710 {
711         DPSOFTRAST_Texture *texture;
712         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713         return texture->mipmap[mip][3];
714 }
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
716 {
717         DPSOFTRAST_Texture *texture;
718         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719         return texture->mipmap[mip][4];
720 }
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
722 {
723         DPSOFTRAST_Texture *texture;
724         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725         if (texture->binds)
726                 DPSOFTRAST_Flush();
727         return texture->bytes + texture->mipmap[mip][0];
728 }
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
734         {
735                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
736                 return;
737         }
738         if (texture->binds)
739                 DPSOFTRAST_Flush();
740         texture->filter = filter;
741 }
742
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
744 {
745         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
748                 DPSOFTRAST_Flush();
749         dpsoftrast.fb_width = width;
750         dpsoftrast.fb_height = height;
751         dpsoftrast.fb_depthpixels = depthpixels;
752         dpsoftrast.fb_colorpixels[0] = colorpixels0;
753         dpsoftrast.fb_colorpixels[1] = colorpixels1;
754         dpsoftrast.fb_colorpixels[2] = colorpixels2;
755         dpsoftrast.fb_colorpixels[3] = colorpixels3;
756 }
757
758 static void DPSOFTRAST_Draw_FlushThreads(void);
759
760 static void DPSOFTRAST_Draw_SyncCommands(void)
761 {
762         if(dpsoftrast.usethreads) MEMORY_BARRIER;
763         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
764 }
765
766 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
767 {
768         DPSOFTRAST_State_Thread *thread;
769         int i;
770         int freecommand = dpsoftrast.commandpool.freecommand;
771         int usedcommands = dpsoftrast.commandpool.usedcommands;
772         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
773                 return;
774         DPSOFTRAST_Draw_SyncCommands();
775         for(;;)
776         {
777                 int waitindex = -1;
778                 int commandoffset;
779                 usedcommands = 0;
780                 for (i = 0; i < dpsoftrast.numthreads; i++)
781                 {
782                         thread = &dpsoftrast.threads[i]; 
783                         commandoffset = freecommand - thread->commandoffset;
784                         if (commandoffset < 0)
785                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
786                         if (commandoffset > usedcommands)
787                         {
788                                 waitindex = i;
789                                 usedcommands = commandoffset;
790                         }
791                 }
792                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
793                         break;
794                 thread = &dpsoftrast.threads[waitindex];
795                 Thread_LockMutex(thread->drawmutex);
796                 if (thread->commandoffset != dpsoftrast.drawcommand)
797                 {
798                         thread->waiting = true;
799                         if (thread->starving) Thread_CondSignal(thread->drawcond);
800                         Thread_CondWait(thread->waitcond, thread->drawmutex);
801                         thread->waiting = false;
802                 }
803                 Thread_UnlockMutex(thread->drawmutex);
804         }
805         dpsoftrast.commandpool.usedcommands = usedcommands;
806 }
807
// rounds size up to the next multiple of COMMAND_SIZE (which has to be a power of two for the masking to work)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// allocates a command of the given name from the pool, sized to its aligned struct size,
// and yields it as a pointer to that command's struct type
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name , \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
812
813 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
814 {
815         DPSOFTRAST_Command *command;
816         int freecommand = dpsoftrast.commandpool.freecommand;
817         int usedcommands = dpsoftrast.commandpool.usedcommands;
818         int extra = sizeof(DPSOFTRAST_Command);
819         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
820                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
821         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
822         {
823                 if (dpsoftrast.usethreads)
824                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
825                 else
826                         DPSOFTRAST_Draw_FlushThreads();
827                 freecommand = dpsoftrast.commandpool.freecommand;
828                 usedcommands = dpsoftrast.commandpool.usedcommands;
829         }
830         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831         {
832                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833                 command->opcode = DPSOFTRAST_OPCODE_Reset;
834                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
835                 freecommand = 0;
836         }
837         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838         command->opcode = opcode;
839         command->commandsize = size;
840         freecommand += size;
841         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
842                 freecommand = 0;
843         dpsoftrast.commandpool.freecommand = freecommand;
844         dpsoftrast.commandpool.usedcommands = usedcommands + size;
845         return command;
846 }
847
848 static void DPSOFTRAST_UndoCommand(int size)
849 {
850         int freecommand = dpsoftrast.commandpool.freecommand;
851         int usedcommands = dpsoftrast.commandpool.usedcommands;
852         freecommand -= size;
853         if (freecommand < 0)
854                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855         usedcommands -= size;
856         dpsoftrast.commandpool.freecommand = freecommand;
857         dpsoftrast.commandpool.usedcommands = usedcommands;
858 }
859                 
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
862 {
863         thread->viewport[0] = command->x;
864         thread->viewport[1] = command->y;
865         thread->viewport[2] = command->width;
866         thread->viewport[3] = command->height;
867         thread->validate |= DPSOFTRAST_VALIDATE_FB;
868 }
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
870 {
871         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
872         command->x = x;
873         command->y = y;
874         command->width = width;
875         command->height = height;
876
877         dpsoftrast.viewport[0] = x;
878         dpsoftrast.viewport[1] = y;
879         dpsoftrast.viewport[2] = width;
880         dpsoftrast.viewport[3] = height;
881         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
882 }
883
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
886 {
887         int i, x1, y1, x2, y2, w, h, x, y;
888         int miny1 = thread->miny1;
889         int maxy1 = thread->maxy1;
890         int miny2 = thread->miny2;
891         int maxy2 = thread->maxy2;
892         int bandy;
893         unsigned int *p;
894         unsigned int c;
895         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896         x1 = thread->fb_scissor[0];
897         y1 = thread->fb_scissor[1];
898         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
900         if (y1 < miny1) y1 = miny1;
901         if (y2 > maxy2) y2 = maxy2;
902         w = x2 - x1;
903         h = y2 - y1;
904         if (w < 1 || h < 1)
905                 return;
906         // FIXME: honor fb_colormask?
907         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
908         for (i = 0;i < 4;i++)
909         {
910                 if (!dpsoftrast.fb_colorpixels[i])
911                         continue;
912                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
913                 for (;y < bandy;y++)
914                 {
915                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916                         for (x = x1;x < x2;x++)
917                                 p[x] = c;
918                 }
919         }
920 }
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
922 {
923         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
924         command->r = r;
925         command->g = g;
926         command->b = b;
927         command->a = a;
928 }
929
930 DEFCOMMAND(3, ClearDepth, float depth;)
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
932 {
933         int x1, y1, x2, y2, w, h, x, y;
934         int miny1 = thread->miny1;
935         int maxy1 = thread->maxy1;
936         int miny2 = thread->miny2;
937         int maxy2 = thread->maxy2;
938         int bandy;
939         unsigned int *p;
940         unsigned int c;
941         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942         x1 = thread->fb_scissor[0];
943         y1 = thread->fb_scissor[1];
944         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
946         if (y1 < miny1) y1 = miny1;
947         if (y2 > maxy2) y2 = maxy2;
948         w = x2 - x1;
949         h = y2 - y1;
950         if (w < 1 || h < 1)
951                 return;
952         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
953         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
954         for (;y < bandy;y++)
955         {
956                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957                 for (x = x1;x < x2;x++)
958                         p[x] = c;
959         }
960 }
961 void DPSOFTRAST_ClearDepth(float d)
962 {
963         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
964         command->depth = d;
965 }
966
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
969 {
970         thread->colormask[0] = command->r != 0;
971         thread->colormask[1] = command->g != 0;
972         thread->colormask[2] = command->b != 0;
973         thread->colormask[3] = command->a != 0;
974         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
975 }
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
977 {
978         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
985 DEFCOMMAND(5, DepthTest, int enable;)
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
987 {
988         thread->depthtest = command->enable;
989         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
990 }
991 void DPSOFTRAST_DepthTest(int enable)
992 {
993         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994         command->enable = enable;
995 }
996
997 DEFCOMMAND(6, ScissorTest, int enable;)
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
999 {
1000         thread->scissortest = command->enable;
1001         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1002 }
1003 void DPSOFTRAST_ScissorTest(int enable)
1004 {
1005         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006         command->enable = enable;
1007 }
1008
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1011 {
1012         thread->scissor[0] = command->x;
1013         thread->scissor[1] = command->y;
1014         thread->scissor[2] = command->width;
1015         thread->scissor[3] = command->height;
1016         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1017 }
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1019 {
1020         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1021         command->x = x;
1022         command->y = y;
1023         command->width = width;
1024         command->height = height;
1025 }
1026
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1029 {
1030         thread->blendfunc[0] = command->sfactor;
1031         thread->blendfunc[1] = command->dfactor;
1032         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1033 }
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1035 {
1036         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037         command->sfactor = sfactor;
1038         command->dfactor = dfactor;
1039 }
1040
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1043 {
1044         thread->blendsubtract = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1046 }
1047 void DPSOFTRAST_BlendSubtract(int enable)
1048 {
1049         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(10, DepthMask, int enable;)
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1055 {
1056         thread->depthmask = command->enable;
1057 }
1058 void DPSOFTRAST_DepthMask(int enable)
1059 {
1060         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061         command->enable = enable;
1062 }
1063
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1066 {
1067         thread->depthfunc = command->func;
1068 }
1069 void DPSOFTRAST_DepthFunc(int func)
1070 {
1071         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072         command->func = func;
1073 }
1074
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1077 {
1078         thread->depthrange[0] = command->nearval;
1079         thread->depthrange[1] = command->farval;
1080 }
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1082 {
1083         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084         command->nearval = nearval;
1085         command->farval = farval;
1086 }
1087
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1090 {
1091         thread->polygonoffset[0] = command->alongnormal;
1092         thread->polygonoffset[1] = command->intoview;
1093 }
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1095 {
1096         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097         command->alongnormal = alongnormal;
1098         command->intoview = intoview;
1099 }
1100
1101 DEFCOMMAND(14, CullFace, int mode;)
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1103 {
1104         thread->cullface = command->mode;
1105 }
1106 void DPSOFTRAST_CullFace(int mode)
1107 {
1108         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109         command->mode = mode;
1110 }
1111
1112 DEFCOMMAND(15, AlphaTest, int enable;)
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1114 {
1115         thread->alphatest = command->enable;
1116 }
1117 void DPSOFTRAST_AlphaTest(int enable)
1118 {
1119         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120         command->enable = enable;
1121 }
1122
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1125 {
1126         thread->alphafunc = command->func;
1127         thread->alphavalue = command->ref;
1128 }
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1130 {
1131         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132         command->func = func;
1133         command->ref = ref;
1134 }
1135
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1137 {
1138         dpsoftrast.color[0] = r;
1139         dpsoftrast.color[1] = g;
1140         dpsoftrast.color[2] = b;
1141         dpsoftrast.color[3] = a;
1142 }
1143
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1145 {
1146         int outstride = blockwidth * 4;
1147         int instride = dpsoftrast.fb_width * 4;
1148         int bx1 = blockx;
1149         int by1 = blocky;
1150         int bx2 = blockx + blockwidth;
1151         int by2 = blocky + blockheight;
1152         int bw;
1153         int x;
1154         int y;
1155         unsigned char *inpixels;
1156         unsigned char *b;
1157         unsigned char *o;
1158         DPSOFTRAST_Flush();
1159         if (bx1 < 0) bx1 = 0;
1160         if (by1 < 0) by1 = 0;
1161         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1162         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1163         bw = bx2 - bx1;
1164         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1165         if (dpsoftrast.bigendian)
1166         {
1167                 for (y = by1;y < by2;y++)
1168                 {
1169                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1170                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1171                         for (x = bx1;x < bx2;x++)
1172                         {
1173                                 o[0] = b[3];
1174                                 o[1] = b[2];
1175                                 o[2] = b[1];
1176                                 o[3] = b[0];
1177                                 o += 4;
1178                                 b += 4;
1179                         }
1180                 }
1181         }
1182         else
1183         {
1184                 for (y = by1;y < by2;y++)
1185                 {
1186                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1187                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1188                         memcpy(o, b, bw*4);
1189                 }
1190         }
1191
1192 }
1193 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1194 {
1195         int tx1 = tx;
1196         int ty1 = ty;
1197         int tx2 = tx + width;
1198         int ty2 = ty + height;
1199         int sx1 = sx;
1200         int sy1 = sy;
1201         int sx2 = sx + width;
1202         int sy2 = sy + height;
1203         int swidth;
1204         int sheight;
1205         int twidth;
1206         int theight;
1207         int sw;
1208         int sh;
1209         int tw;
1210         int th;
1211         int y;
1212         unsigned int *spixels;
1213         unsigned int *tpixels;
1214         DPSOFTRAST_Texture *texture;
1215         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1216         if (mip < 0 || mip >= texture->mipmaps) return;
1217         DPSOFTRAST_Flush();
1218         spixels = dpsoftrast.fb_colorpixels[0];
1219         swidth = dpsoftrast.fb_width;
1220         sheight = dpsoftrast.fb_height;
1221         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1222         twidth = texture->mipmap[mip][2];
1223         theight = texture->mipmap[mip][3];
1224         if (tx1 < 0) tx1 = 0;
1225         if (ty1 < 0) ty1 = 0;
1226         if (tx2 > twidth) tx2 = twidth;
1227         if (ty2 > theight) ty2 = theight;
1228         if (sx1 < 0) sx1 = 0;
1229         if (sy1 < 0) sy1 = 0;
1230         if (sx2 > swidth) sx2 = swidth;
1231         if (sy2 > sheight) sy2 = sheight;
1232         tw = tx2 - tx1;
1233         th = ty2 - ty1;
1234         sw = sx2 - sx1;
1235         sh = sy2 - sy1;
1236         if (tw > sw) tw = sw;
1237         if (th > sh) th = sh;
1238         if (tw < 1 || th < 1)
1239                 return;
1240         for (y = 0;y < th;y++)
1241                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1242         if (texture->mipmaps > 1)
1243                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1244 }
1245
1246 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1247 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1248 {
1249         if (thread->texbound[command->unitnum])
1250                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1251         thread->texbound[command->unitnum] = command->texture;
1252 }
1253 void DPSOFTRAST_SetTexture(int unitnum, int index)
1254 {
1255         DPSOFTRAST_Command_SetTexture *command;
1256         DPSOFTRAST_Texture *texture;
1257         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1258         {
1259                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1260                 return;
1261         }
1262         texture = DPSOFTRAST_Texture_GetByIndex(index);
1263         if (index && !texture)
1264         {
1265                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1266                 return;
1267         }
1268
1269         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1270         command->unitnum = unitnum;
1271         command->texture = texture;
1272
1273         dpsoftrast.texbound[unitnum] = texture;
1274         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1275 }
1276
1277 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1278 {
1279         dpsoftrast.pointer_vertex3f = vertex3f;
1280         dpsoftrast.stride_vertex = stride;
1281 }
1282 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1283 {
1284         dpsoftrast.pointer_color4f = color4f;
1285         dpsoftrast.pointer_color4ub = NULL;
1286         dpsoftrast.stride_color = stride;
1287 }
1288 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1289 {
1290         dpsoftrast.pointer_color4f = NULL;
1291         dpsoftrast.pointer_color4ub = color4ub;
1292         dpsoftrast.stride_color = stride;
1293 }
1294 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1295 {
1296         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1297         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1298         dpsoftrast.stride_texcoord[unitnum] = stride;
1299 }
1300
1301 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1302 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1303 {
1304         thread->shader_mode = command->mode;
1305         thread->shader_permutation = command->permutation;
1306 }
1307 void DPSOFTRAST_SetShader(int mode, int permutation)
1308 {
1309         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1310         command->mode = mode;
1311         command->permutation = permutation;
1312
1313         dpsoftrast.shader_mode = mode;
1314         dpsoftrast.shader_permutation = permutation;
1315 }
1316
1317 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1318 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1319 {
1320         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1321 }
1322 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1323 {
1324         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1325         command->index = index;
1326         command->val[0] = v0;
1327         command->val[1] = v1;
1328         command->val[2] = v2;
1329         command->val[3] = v3;
1330
1331         dpsoftrast.uniform4f[index*4+0] = v0;
1332         dpsoftrast.uniform4f[index*4+1] = v1;
1333         dpsoftrast.uniform4f[index*4+2] = v2;
1334         dpsoftrast.uniform4f[index*4+3] = v3;
1335 }
1336 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1337 {
1338         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1339         command->index = index;
1340         memcpy(command->val, v, sizeof(command->val));
1341
1342         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1343 }
1344
1345 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1346 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1347 {
1348         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1349 }
1350 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1351 {
1352 #ifdef SSE2_PRESENT
1353         int i, index;
1354         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1355         {
1356                 __m128 m0, m1, m2, m3;
1357                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1358                 command->index = (DPSOFTRAST_UNIFORM)index;
1359                 if (((size_t)v)&(ALIGN_SIZE-1))
1360                 {
1361                         m0 = _mm_loadu_ps(v);
1362                         m1 = _mm_loadu_ps(v+4);
1363                         m2 = _mm_loadu_ps(v+8);
1364                         m3 = _mm_loadu_ps(v+12);
1365                 }
1366                 else
1367                 {
1368                         m0 = _mm_load_ps(v);
1369                         m1 = _mm_load_ps(v+4);
1370                         m2 = _mm_load_ps(v+8);
1371                         m3 = _mm_load_ps(v+12);
1372                 }
1373                 if (transpose)
1374                 {
1375                         __m128 t0, t1, t2, t3;
1376                         t0 = _mm_unpacklo_ps(m0, m1);
1377                         t1 = _mm_unpacklo_ps(m2, m3);
1378                         t2 = _mm_unpackhi_ps(m0, m1);
1379                         t3 = _mm_unpackhi_ps(m2, m3);
1380                         m0 = _mm_movelh_ps(t0, t1);
1381                         m1 = _mm_movehl_ps(t1, t0);
1382                         m2 = _mm_movelh_ps(t2, t3);
1383                         m3 = _mm_movehl_ps(t3, t2);                     
1384                 }
1385                 _mm_store_ps(command->val, m0);
1386                 _mm_store_ps(command->val+4, m1);
1387                 _mm_store_ps(command->val+8, m2);
1388                 _mm_store_ps(command->val+12, m3);
1389                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1390                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1391                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1392                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1393         }
1394 #endif
1395 }
1396
1397 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1398 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1399 {
1400         thread->uniform1i[command->index] = command->val;
1401 }
1402 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1403 {
1404         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1405         command->index = index;
1406         command->val = i0;
1407
1408         dpsoftrast.uniform1i[command->index] = i0;
1409 }
1410
1411 #ifdef SSE2_PRESENT
1412 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1413 {
1414         float *end = dst + size*4;
1415         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1416         {
1417                 while (dst < end)
1418                 {
1419                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1420                         dst += 4;
1421                         src += stride;
1422                 }
1423         }
1424         else
1425         {
1426                 while (dst < end)
1427                 {
1428                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1429                         dst += 4;
1430                         src += stride;
1431                 }
1432         }
1433 }
1434
1435 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1436 {
1437         float *end = dst + size*4;
1438         if (stride == sizeof(float[3]))
1439         {
1440                 float *end4 = dst + (size&~3)*4;        
1441                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1442                 {
1443                         while (dst < end4)
1444                         {
1445                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1446                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1447                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1448                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1449                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1450                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1451                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1452                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1453                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1454                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1455                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1456                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1457                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1458                                 dst += 16;
1459                                 src += 4*sizeof(float[3]);
1460                         }
1461                 }
1462                 else
1463                 {
1464                         while (dst < end4)
1465                         {
1466                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1467                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1468                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1469                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1470                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1471                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1472                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1474                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1475                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1476                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1477                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dst += 16;
1480                                 src += 4*sizeof(float[3]);
1481                         }
1482                 }
1483         }
1484         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1485         {
1486                 while (dst < end)
1487                 {
1488                         __m128 v = _mm_loadu_ps((const float *)src);
1489                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1490                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1491                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1492                         _mm_store_ps(dst, v);
1493                         dst += 4;
1494                         src += stride;
1495                 }
1496         }
1497         else
1498         {
1499                 while (dst < end)
1500                 {
1501                         __m128 v = _mm_load_ps((const float *)src);
1502                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1503                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1504                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1505                         _mm_store_ps(dst, v);
1506                         dst += 4;
1507                         src += stride;
1508                 }
1509         }
1510 }
1511
1512 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1513 {
1514         float *end = dst + size*4;
1515         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1516         if (stride == sizeof(float[2]))
1517         {
1518                 float *end2 = dst + (size&~1)*4;
1519                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1520                 {
1521                         while (dst < end2)
1522                         {
1523                                 __m128 v = _mm_loadu_ps((const float *)src);
1524                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1525                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1526                                 dst += 8;
1527                                 src += 2*sizeof(float[2]);
1528                         }
1529                 }
1530                 else
1531                 {
1532                         while (dst < end2)
1533                         {
1534                                 __m128 v = _mm_load_ps((const float *)src);
1535                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1536                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1537                                 dst += 8;
1538                                 src += 2*sizeof(float[2]);
1539                         }
1540                 }
1541         }
1542         while (dst < end)
1543         {
1544                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1545                 dst += 4;
1546                 src += stride;
1547         }
1548 }
1549
1550 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1551 {
1552         float *end = dst + size*4;
1553         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1554         if (stride == sizeof(unsigned char[4]))
1555         {
1556                 float *end4 = dst + (size&~3)*4;
1557                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1558                 {
1559                         while (dst < end4)
1560                         {
1561                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1562                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1563                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1564                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1565                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1566                                 dst += 16;
1567                                 src += 4*sizeof(unsigned char[4]);
1568                         }
1569                 }
1570                 else
1571                 {
1572                         while (dst < end4)
1573                         {
1574                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1575                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1576                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1577                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1578                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1579                                 dst += 16;
1580                                 src += 4*sizeof(unsigned char[4]);
1581                         }
1582                 }
1583         }
1584         while (dst < end)
1585         {
1586                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1587                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1588                 dst += 4;
1589                 src += stride;
1590         }
1591 }
1592
// Writes `size` copies of the 4-float vector at src into dst.
//   dst - receives size*4 floats; written with aligned stores
//   src - the vector to replicate, any alignment
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + i*4, value);
}
1603 #endif
1604
// Transforms numitems 4-float vertices by a 4x4 matrix:
//   out = in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15]
//   out4f       - receives numitems*4 floats; written with aligned stores
//   in4f        - input vertices, any alignment; may be the same pointer as out4f
//   inmatrix16f - the 16 matrix floats, any alignment
// A matrix that is bytewise equal to the identity turns the call into a plain copy
// (or into nothing at all when out4f == in4f).
// Without SSE2_PRESENT the function body is empty and nothing is written.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	int i;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1652
// Copies numitems 4-float vertices from in4f to out4f.
// Uses memcpy, so the two ranges must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t numbytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, numbytes);
}
1657
1658 #ifdef SSE2_PRESENT
// Perspective-divides one vertex and maps it through the viewport.
// The input (x, y, z, w) is rearranged to (1, x, y, z), multiplied lane by lane
// with viewportscale, divided by w, offset by viewportcenter and rotated back:
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN / sN are lanes of viewportcenter / viewportscale.
// Expands to a brace block that declares the locals p and w.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1666
// Projects one vertex through the viewport; despite the name it computes all
// four lanes and currently expands to the same code as DPSOFTRAST_PROJECTVERTEX:
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN / sN are lanes of viewportcenter / viewportscale.
// Expands to a brace block that declares the locals p and w.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1674
// Multiplies one vertex by a matrix given as four registers:
//   out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3))
// Expands to a brace block that declares the locals p and psum.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	__m128 psum = _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	psum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), psum); \
	psum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), psum); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), psum); \
}
1683
// Computes the range of screen rows covered by the box spanned by minpos/maxpos
// after transforming its eight corners by the matrix m0..m3.
//   starty, endy   - receive the row range (endy is the truncated value plus one)
//   minpos, maxpos - componentwise minimum / maximum of the untransformed positions
//   viewportcenter, viewportscale - lane 2 of each is used for the y mapping
// Returns a mask with bit k set for every corner k whose clip distance (z + w
// after the transform) did not pass the >= 0 test; bit 0/1/2 of k selects the
// max instead of the min value for x/y/z.
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	static const int axisbits[3] = {1, 2, 4};
	__m128 corner[8], bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
	__m128 maxx_minrest, minx_maxrest;
	int clipmask = 0xFF;
	int k, axis;

	// swap lanes 0 and 1 of every matrix row so that the transformed y lands in
	// lane 0, where the scalar (_ss) instructions below operate
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));

	// build the eight corners of the box
	maxx_minrest = _mm_move_ss(minpos, maxpos);
	minx_maxrest = _mm_move_ss(maxpos, minpos);
	corner[0] = minpos;
	corner[1] = maxx_minrest;
	corner[2] = _mm_shuffle_ps(minx_maxrest, minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[3] = _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[4] = _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[5] = _mm_shuffle_ps(maxx_minrest, maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[6] = minx_maxrest;
	corner[7] = maxpos;

	// transform every corner; corners passing the clip test contribute y/w directly
	for (k = 0; k < 8; k++)
	{
		const __m128 pos = corner[k];
		__m128 wwww;
		bb[k] = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(pos, pos, _MM_SHUFFLE(0, 0, 0, 0)), m0),
			_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(pos, pos, _MM_SHUFFLE(1, 1, 1, 1)), m1),
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(pos, pos, _MM_SHUFFLE(2, 2, 2, 2)), m2),
					_mm_mul_ps(_mm_shuffle_ps(pos, pos, _MM_SHUFFLE(3, 3, 3, 3)), m3))));
		wwww = _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3));
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), wwww);
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps()))
		{
			const __m128 proj = _mm_div_ss(bb[k], wwww);
			clipmask &= ~(1 << k);
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// for every failed corner, walk the three box edges leaving it; an edge that
	// ends at a passing corner is cut where the clip distance reaches zero and
	// the cut point contributes its y/w
	for (k = 0; k < 8; k++)
	{
		if (!(clipmask & (1 << k)))
			continue;
		for (axis = 0; axis < 3; axis++)
		{
			const int other = k ^ axisbits[axis];
			__m128 frac, proj;
			if (clipmask & (1 << other))
				continue;
			frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[other]));
			proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[other], bb[k])));
			proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3)));
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// bring lane 2 of the viewport parameters into lane 0
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	// limit the projected range to [-2, 2] before mapping it to rows
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	// the start row comes from the maximum and the end row from the minimum
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1754         
1755 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1756 {
1757         float *end = out4f + numitems*4;
1758         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1759         __m128 minpos, maxpos;
1760         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1761         {
1762                 minpos = maxpos = _mm_loadu_ps(in4f);
1763                 while (out4f < end)
1764                 {
1765                         __m128 v = _mm_loadu_ps(in4f);
1766                         minpos = _mm_min_ps(minpos, v);
1767                         maxpos = _mm_max_ps(maxpos, v);
1768                         _mm_store_ps(out4f, v);
1769                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1770                         _mm_store_ps(screen4f, v);
1771                         in4f += 4;
1772                         out4f += 4;
1773                         screen4f += 4;
1774                 }
1775         }
1776         else
1777         {
1778                 minpos = maxpos = _mm_load_ps(in4f);
1779                 while (out4f < end)
1780                 {
1781                         __m128 v = _mm_load_ps(in4f);
1782                         minpos = _mm_min_ps(minpos, v);
1783                         maxpos = _mm_max_ps(maxpos, v);
1784                         _mm_store_ps(out4f, v);
1785                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1786                         _mm_store_ps(screen4f, v);
1787                         in4f += 4;
1788                         out4f += 4;
1789                         screen4f += 4;
1790                 }
1791         }
1792         if (starty && endy) 
1793                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1794                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1795                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1796                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1797                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1798         return 0;
1799 }
1800
1801 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1802 {
1803         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1804         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1805         float *end;
1806         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1807                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1808         end = out4f + numitems*4;
1809         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1810         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1811         m0 = _mm_loadu_ps(inmatrix16f);
1812         m1 = _mm_loadu_ps(inmatrix16f + 4);
1813         m2 = _mm_loadu_ps(inmatrix16f + 8);
1814         m3 = _mm_loadu_ps(inmatrix16f + 12);
1815         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1816         {
1817                 minpos = maxpos = _mm_loadu_ps(in4f);
1818                 while (out4f < end)
1819                 {
1820                         __m128 v = _mm_loadu_ps(in4f);
1821                         minpos = _mm_min_ps(minpos, v);
1822                         maxpos = _mm_max_ps(maxpos, v);
1823                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1824                         _mm_store_ps(out4f, v);
1825                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1826                         _mm_store_ps(screen4f, v);
1827                         in4f += 4;
1828                         out4f += 4;
1829                         screen4f += 4;
1830                 }
1831         }
1832         else
1833         {
1834                 minpos = maxpos = _mm_load_ps(in4f);
1835                 while (out4f < end)
1836                 {
1837                         __m128 v = _mm_load_ps(in4f);
1838                         minpos = _mm_min_ps(minpos, v);
1839                         maxpos = _mm_max_ps(maxpos, v);
1840                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1841                         _mm_store_ps(out4f, v);
1842                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843                         _mm_store_ps(screen4f, v);
1844                         in4f += 4;
1845                         out4f += 4;
1846                         screen4f += 4;
1847                 }
1848         }
1849         if (starty && endy) 
1850                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1851         return 0;
1852 }
1853 #endif
1854
// Converts one of the currently bound vertex arrays into 4-float form, storing
// dpsoftrast.numvertices elements (starting at dpsoftrast.firstvertex) in
// dpsoftrast.post_array4f[outarray].
//   DPSOFTRAST_ARRAY_POSITION - 3-float positions
//   DPSOFTRAST_ARRAY_COLOR    - float colors if bound, else byte colors if bound,
//                               else the constant dpsoftrast.color
//   anything else             - texcoord array (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//                               with 2, 3 or 4 components; left untouched when it is
//                               not bound or has another component count
// Returns the output array, or NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			default:
				// other component counts are not converted
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1913
1914 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1915 {
1916         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1917         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1918         return data;
1919 }
1920
#if 0
// (compiled out) Projects a vertex array to dpsoftrast.screencoord4f without
// transforming it, updating dpsoftrast.drawclipped/drawstarty/drawendy.
// With inarray >= 0 the array is first loaded by DPSOFTRAST_Array_Load.
// Returns the array, or NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1933
// Transforms a vertex array in place by inmatrix16f and writes the projected
// vertices to dpsoftrast.screencoord4f, updating dpsoftrast.drawclipped,
// drawstarty and drawendy from DPSOFTRAST_Vertex_TransformProject.
// With inarray >= 0 the array is first loaded by DPSOFTRAST_Array_Load;
// otherwise the current contents of post_array4f[outarray] are used.
// Returns the array, or NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1944
1945 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1946 {
1947         int x;
1948         int startx = span->startx;
1949         int endx = span->endx;
1950         float wslope = triangle->w[0];
1951         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1952         float endz = 1.0f / (w + wslope * startx);
1953         for (x = startx;x < endx;)
1954         {
1955                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1956                 float z = endz, dz;
1957                 if (nextsub >= endx) nextsub = endsub = endx-1;
1958                 endz = 1.0f / (w + wslope * nextsub);
1959                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1960                 for (; x <= endsub; x++, z += dz)
1961                         zf[x] = z;
1962         }
1963 }
1964
1965 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1966 {
1967         int x;
1968         int startx = span->startx;
1969         int endx = span->endx;
1970         int d[4];
1971         float a, b;
1972         unsigned char * RESTRICT pixelmask = span->pixelmask;
1973         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1974         if (!pixel)
1975                 return;
1976         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1977         // handle alphatest now (this affects depth writes too)
1978         if (thread->alphatest)
1979                 for (x = startx;x < endx;x++)
1980                         if (in4f[x*4+3] < 0.5f)
1981                                 pixelmask[x] = false;
1982         // FIXME: this does not handle bigendian
1983         switch(thread->fb_blendmode)
1984         {
1985         case DPSOFTRAST_BLENDMODE_OPAQUE:
1986                 for (x = startx;x < endx;x++)
1987                 {
1988                         if (!pixelmask[x])
1989                                 continue;
1990                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1991                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1992                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1993                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1994                         pixel[x*4+0] = d[0];
1995                         pixel[x*4+1] = d[1];
1996                         pixel[x*4+2] = d[2];
1997                         pixel[x*4+3] = d[3];
1998                 }
1999                 break;
2000         case DPSOFTRAST_BLENDMODE_ALPHA:
2001                 for (x = startx;x < endx;x++)
2002                 {
2003                         if (!pixelmask[x])
2004                                 continue;
2005                         a = in4f[x*4+3] * 255.0f;
2006                         b = 1.0f - in4f[x*4+3];
2007                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2008                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2009                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2010                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2011                         pixel[x*4+0] = d[0];
2012                         pixel[x*4+1] = d[1];
2013                         pixel[x*4+2] = d[2];
2014                         pixel[x*4+3] = d[3];
2015                 }
2016                 break;
2017         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2018                 for (x = startx;x < endx;x++)
2019                 {
2020                         if (!pixelmask[x])
2021                                 continue;
2022                         a = in4f[x*4+3] * 255.0f;
2023                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2024                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2025                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2026                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2027                         pixel[x*4+0] = d[0];
2028                         pixel[x*4+1] = d[1];
2029                         pixel[x*4+2] = d[2];
2030                         pixel[x*4+3] = d[3];
2031                 }
2032                 break;
2033         case DPSOFTRAST_BLENDMODE_ADD:
2034                 for (x = startx;x < endx;x++)
2035                 {
2036                         if (!pixelmask[x])
2037                                 continue;
2038                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2039                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2040                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2041                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2042                         pixel[x*4+0] = d[0];
2043                         pixel[x*4+1] = d[1];
2044                         pixel[x*4+2] = d[2];
2045                         pixel[x*4+3] = d[3];
2046                 }
2047                 break;
2048         case DPSOFTRAST_BLENDMODE_INVMOD:
2049                 for (x = startx;x < endx;x++)
2050                 {
2051                         if (!pixelmask[x])
2052                                 continue;
2053                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2054                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2055                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2056                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2057                         pixel[x*4+0] = d[0];
2058                         pixel[x*4+1] = d[1];
2059                         pixel[x*4+2] = d[2];
2060                         pixel[x*4+3] = d[3];
2061                 }
2062                 break;
2063         case DPSOFTRAST_BLENDMODE_MUL:
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (!pixelmask[x])
2067                                 continue;
2068                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2069                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2070                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2071                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2072                         pixel[x*4+0] = d[0];
2073                         pixel[x*4+1] = d[1];
2074                         pixel[x*4+2] = d[2];
2075                         pixel[x*4+3] = d[3];
2076                 }
2077                 break;
2078         case DPSOFTRAST_BLENDMODE_MUL2:
2079                 for (x = startx;x < endx;x++)
2080                 {
2081                         if (!pixelmask[x])
2082                                 continue;
2083                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2084                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2085                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2086                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2087                         pixel[x*4+0] = d[0];
2088                         pixel[x*4+1] = d[1];
2089                         pixel[x*4+2] = d[2];
2090                         pixel[x*4+3] = d[3];
2091                 }
2092                 break;
2093         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2094                 for (x = startx;x < endx;x++)
2095                 {
2096                         if (!pixelmask[x])
2097                                 continue;
2098                         a = in4f[x*4+3] * -255.0f;
2099                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2100                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2101                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2102                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2103                         pixel[x*4+0] = d[0];
2104                         pixel[x*4+1] = d[1];
2105                         pixel[x*4+2] = d[2];
2106                         pixel[x*4+3] = d[3];
2107                 }
2108                 break;
2109         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2110                 for (x = startx;x < endx;x++)
2111                 {
2112                         if (!pixelmask[x])
2113                                 continue;
2114                         a = 255.0f;
2115                         b = 1.0f - in4f[x*4+3];
2116                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2117                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2118                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2119                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2120                         pixel[x*4+0] = d[0];
2121                         pixel[x*4+1] = d[1];
2122                         pixel[x*4+2] = d[2];
2123                         pixel[x*4+3] = d[3];
2124                 }
2125                 break;
2126         case DPSOFTRAST_BLENDMODE_INVADD:
2127                 for (x = startx;x < endx;x++)
2128                 {
2129                         if (!pixelmask[x])
2130                                 continue;
2131                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2132                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2133                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2134                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2135                         pixel[x*4+0] = d[0];
2136                         pixel[x*4+1] = d[1];
2137                         pixel[x*4+2] = d[2];
2138                         pixel[x*4+3] = d[3];
2139                 }
2140                 break;
2141         }
2142 }
2143
2144 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2145 {
2146 #ifdef SSE2_PRESENT
2147         int x;
2148         int startx = span->startx;
2149         int endx = span->endx;
2150         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2151         unsigned char * RESTRICT pixelmask = span->pixelmask;
2152         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2153         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2154         if (!pixel)
2155                 return;
2156         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2157         pixeli += span->y * dpsoftrast.fb_width + span->x;
2158         // handle alphatest now (this affects depth writes too)
2159         if (thread->alphatest)
2160                 for (x = startx;x < endx;x++)
2161                         if (in4ub[x*4+3] < 0.5f)
2162                                 pixelmask[x] = false;
2163         // FIXME: this does not handle bigendian
2164         switch(thread->fb_blendmode)
2165         {
2166         case DPSOFTRAST_BLENDMODE_OPAQUE:
2167                 for (x = startx;x + 4 <= endx;)
2168                 {
2169                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2170                         {
2171                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2172                                 x += 4;
2173                         }
2174                         else
2175                         {
2176                                 if (pixelmask[x])
2177                                         pixeli[x] = ini[x];
2178                                 x++;
2179                         }
2180                 }
2181                 for (;x < endx;x++)
2182                         if (pixelmask[x])
2183                                 pixeli[x] = ini[x];
2184                 break;
2185         case DPSOFTRAST_BLENDMODE_ALPHA:
2186         #define FINISHBLEND(blend2, blend1) \
2187                 for (x = startx;x + 1 < endx;x += 2) \
2188                 { \
2189                         __m128i src, dst; \
2190                         switch (*(const unsigned short*)&pixelmask[x]) \
2191                         { \
2192                         case 0x0101: \
2193                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2194                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2195                                 blend2; \
2196                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2197                                 continue; \
2198                         case 0x0100: \
2199                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2200                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2201                                 blend1; \
2202                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2203                                 continue; \
2204                         case 0x0001: \
2205                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2206                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2207                                 blend1; \
2208                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2209                                 continue; \
2210                         } \
2211                         break; \
2212                 } \
2213                 for(;x < endx; x++) \
2214                 { \
2215                         __m128i src, dst; \
2216                         if (!pixelmask[x]) \
2217                                 continue; \
2218                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2220                         blend1; \
2221                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2222                 }
2223
2224                 FINISHBLEND({
2225                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2227                 }, {
2228                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230                 });
2231                 break;
2232         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2233                 FINISHBLEND({
2234                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2236                 }, {
2237                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239                 });
2240                 break;
2241         case DPSOFTRAST_BLENDMODE_ADD:
2242                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2243                 break;
2244         case DPSOFTRAST_BLENDMODE_INVMOD:
2245                 FINISHBLEND({
2246                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2247                 }, {
2248                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                 });
2250                 break;
2251         case DPSOFTRAST_BLENDMODE_MUL:
2252                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2253                 break;
2254         case DPSOFTRAST_BLENDMODE_MUL2:
2255                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2256                 break;
2257         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2258                 FINISHBLEND({
2259                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2261                 }, {
2262                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264                 });
2265                 break;
2266         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2267                 FINISHBLEND({
2268                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2270                 }, {
2271                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273                 });
2274                 break;
2275         case DPSOFTRAST_BLENDMODE_INVADD:
2276                 FINISHBLEND({
2277                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2278                 }, {
2279                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                 });
2281                 break;
2282         }
2283 #endif
2284 }
2285
2286 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2287 {
2288         int x;
2289         int startx = span->startx;
2290         int endx = span->endx;
2291         int flags;
2292         float c[4];
2293         float data[4];
2294         float slope[4];
2295         float tc[2], endtc[2];
2296         float tcscale[2];
2297         unsigned int tci[2];
2298         unsigned int tci1[2];
2299         unsigned int tcimin[2];
2300         unsigned int tcimax[2];
2301         int tciwrapmask[2];
2302         int tciwidth;
2303         int filter;
2304         int mip;
2305         const unsigned char * RESTRICT pixelbase;
2306         const unsigned char * RESTRICT pixel[4];
2307         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2308         // if no texture is bound, just fill it with white
2309         if (!texture)
2310         {
2311                 for (x = startx;x < endx;x++)
2312                 {
2313                         out4f[x*4+0] = 1.0f;
2314                         out4f[x*4+1] = 1.0f;
2315                         out4f[x*4+2] = 1.0f;
2316                         out4f[x*4+3] = 1.0f;
2317                 }
2318                 return;
2319         }
2320         mip = triangle->mip[texunitindex];
2321         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2322         // if this mipmap of the texture is 1 pixel, just fill it with that color
2323         if (texture->mipmap[mip][1] == 4)
2324         {
2325                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2326                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2327                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2328                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2329                 for (x = startx;x < endx;x++)
2330                 {
2331                         out4f[x*4+0] = c[0];
2332                         out4f[x*4+1] = c[1];
2333                         out4f[x*4+2] = c[2];
2334                         out4f[x*4+3] = c[3];
2335                 }
2336                 return;
2337         }
2338         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2339         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2340         flags = texture->flags;
2341         tcscale[0] = texture->mipmap[mip][2];
2342         tcscale[1] = texture->mipmap[mip][3];
2343         tciwidth = texture->mipmap[mip][2];
2344         tcimin[0] = 0;
2345         tcimin[1] = 0;
2346         tcimax[0] = texture->mipmap[mip][2]-1;
2347         tcimax[1] = texture->mipmap[mip][3]-1;
2348         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2349         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2350         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2351         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2352         for (x = startx;x < endx;)
2353         {
2354                 unsigned int subtc[2];
2355                 unsigned int substep[2];
2356                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2357                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2358                 if (nextsub >= endx)
2359                 {
2360                         nextsub = endsub = endx-1;      
2361                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2362                 }
2363                 tc[0] = endtc[0];
2364                 tc[1] = endtc[1];
2365                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2366                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2367                 substep[0] = (endtc[0] - tc[0]) * subscale;
2368                 substep[1] = (endtc[1] - tc[1]) * subscale;
2369                 subtc[0] = tc[0] * (1<<16);
2370                 subtc[1] = tc[1] * (1<<16);
2371                 if (filter)
2372                 {
2373                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2374                         {
2375                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2376                                 {
2377                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2378                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2379                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2380                                         tci[0] = subtc[0]>>16;
2381                                         tci[1] = subtc[1]>>16;
2382                                         tci1[0] = tci[0] + 1;
2383                                         tci1[1] = tci[1] + 1;
2384                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2385                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2386                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2387                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2388                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2389                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2390                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2391                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2392                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2393                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2394                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2395                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2396                                         out4f[x*4+0] = c[0];
2397                                         out4f[x*4+1] = c[1];
2398                                         out4f[x*4+2] = c[2];
2399                                         out4f[x*4+3] = c[3];
2400                                 }
2401                         }
2402                         else
2403                         {
2404                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2405                                 {
2406                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2407                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2408                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2409                                         tci[0] = subtc[0]>>16;
2410                                         tci[1] = subtc[1]>>16;
2411                                         tci1[0] = tci[0] + 1;
2412                                         tci1[1] = tci[1] + 1;
2413                                         tci[0] &= tciwrapmask[0];
2414                                         tci[1] &= tciwrapmask[1];
2415                                         tci1[0] &= tciwrapmask[0];
2416                                         tci1[1] &= tciwrapmask[1];
2417                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2418                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2419                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2420                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2421                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2422                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2423                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2424                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2425                                         out4f[x*4+0] = c[0];
2426                                         out4f[x*4+1] = c[1];
2427                                         out4f[x*4+2] = c[2];
2428                                         out4f[x*4+3] = c[3];
2429                                 }
2430                         }
2431                 }
2432                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2433                 {
2434                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2435                         {
2436                                 tci[0] = subtc[0]>>16;
2437                                 tci[1] = subtc[1]>>16;
2438                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2439                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2440                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2441                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2442                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2443                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2444                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2445                                 out4f[x*4+0] = c[0];
2446                                 out4f[x*4+1] = c[1];
2447                                 out4f[x*4+2] = c[2];
2448                                 out4f[x*4+3] = c[3];
2449                         }
2450                 }
2451                 else
2452                 {
2453                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2454                         {
2455                                 tci[0] = subtc[0]>>16;
2456                                 tci[1] = subtc[1]>>16;
2457                                 tci[0] &= tciwrapmask[0];
2458                                 tci[1] &= tciwrapmask[1];
2459                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2461                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2462                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2463                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2464                                 out4f[x*4+0] = c[0];
2465                                 out4f[x*4+1] = c[1];
2466                                 out4f[x*4+2] = c[2];
2467                                 out4f[x*4+3] = c[3];
2468                         }
2469                 }
2470         }
2471 }
2472
2473 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2474 {
2475 #ifdef SSE2_PRESENT
2476         int x;
2477         int startx = span->startx;
2478         int endx = span->endx;
2479         int flags;
2480         __m128 data, slope, tcscale;
2481         __m128i tcsize, tcmask, tcoffset, tcmax;
2482         __m128 tc, endtc;
2483         __m128i subtc, substep, endsubtc;
2484         int filter;
2485         int mip;
2486         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2487         const unsigned char * RESTRICT pixelbase;
2488         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2489         // if no texture is bound, just fill it with white
2490         if (!texture)
2491         {
2492                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2493                 return;
2494         }
2495         mip = triangle->mip[texunitindex];
2496         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2497         // if this mipmap of the texture is 1 pixel, just fill it with that color
2498         if (texture->mipmap[mip][1] == 4)
2499         {
2500                 unsigned int k = *((const unsigned int *)pixelbase);
2501                 for (x = startx;x < endx;x++)
2502                         outi[x] = k;
2503                 return;
2504         }
2505         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2506         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2507         flags = texture->flags;
2508         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2509         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2510         tcscale = _mm_cvtepi32_ps(tcsize);
2511         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2512         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2513         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2514         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2515         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2516         tcmax = _mm_packs_epi32(tcmask, tcmask);
2517         for (x = startx;x < endx;)
2518         {
2519                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2520                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2521                 if (nextsub >= endx)
2522                 {
2523                         nextsub = endsub = endx-1;
2524                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2525                 }       
2526                 tc = endtc;
2527                 subtc = endsubtc;
2528                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2529                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2530                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2531                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2532                 substep = _mm_slli_epi32(substep, 1);
2533                 if (filter)
2534                 {
2535                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2536                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2537                         {
2538                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2539                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2540                                 {
2541                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2542                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2543                                         tci = _mm_madd_epi16(tci, tcoffset);
2544                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2545                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2546                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2547                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2548                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2549                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2550                                         fracm = _mm_srli_epi16(subtc, 1);
2551                                         pix1 = _mm_add_epi16(pix1,
2552                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2553                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2554                                         pix3 = _mm_add_epi16(pix3,
2555                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2556                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2557                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2558                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2559                                         pix2 = _mm_add_epi16(pix2,
2560                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2561                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2562                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2563                                 }
2564                                 if (x <= endsub)
2565                                 {
2566                                         const unsigned char * RESTRICT ptr1;
2567                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2568                                         tci = _mm_madd_epi16(tci, tcoffset);
2569                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2570                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2571                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2572                                         fracm = _mm_srli_epi16(subtc, 1);
2573                                         pix1 = _mm_add_epi16(pix1,
2574                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2575                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2576                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2577                                         pix1 = _mm_add_epi16(pix1,
2578                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2579                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2580                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2581                                         x++;
2582                                 }
2583                         }
2584                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2585                         {
2586                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2587                                 {
2588                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2589                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2590                                         tci = _mm_madd_epi16(tci, tcoffset);
2591                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2592                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2593                                                                                         _mm_setzero_si128());
2594                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2595                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2596                                                                                         _mm_setzero_si128());
2597                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2598                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2599                                         tci = _mm_madd_epi16(tci, tcoffset);
2600                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2601                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2602                                                                                         _mm_setzero_si128());
2603                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2604                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2605                                                                                         _mm_setzero_si128());
2606                                         fracm = _mm_srli_epi16(subtc, 1);
2607                                         pix1 = _mm_add_epi16(pix1,
2608                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2609                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2610                                         pix3 = _mm_add_epi16(pix3,
2611                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2612                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2613                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2614                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2615                                         pix2 = _mm_add_epi16(pix2,
2616                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2617                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2618                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2619                                 }
2620                                 if (x <= endsub)
2621                                 {
2622                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2623                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2624                                         tci = _mm_madd_epi16(tci, tcoffset);
2625                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2626                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2627                                                                                         _mm_setzero_si128());
2628                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2629                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2630                                                                                         _mm_setzero_si128());
2631                                         fracm = _mm_srli_epi16(subtc, 1);
2632                                         pix1 = _mm_add_epi16(pix1,
2633                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2636                                         pix1 = _mm_add_epi16(pix1,
2637                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2638                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2639                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2640                                         x++;
2641                                 }
2642                         }
2643                         else
2644                         {
2645                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2646                                 {
2647                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2648                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2649                                         tci = _mm_madd_epi16(tci, tcoffset);
2650                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2651                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2652                                                                                         _mm_setzero_si128());
2653                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2654                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2655                                                                                         _mm_setzero_si128());
2656                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2657                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2658                                         tci = _mm_madd_epi16(tci, tcoffset);
2659                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2660                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2661                                                                                         _mm_setzero_si128());
2662                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2663                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2664                                                                                         _mm_setzero_si128());
2665                                         fracm = _mm_srli_epi16(subtc, 1);
2666                                         pix1 = _mm_add_epi16(pix1,
2667                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2668                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2669                                         pix3 = _mm_add_epi16(pix3,
2670                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2671                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2672                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2673                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2674                                         pix2 = _mm_add_epi16(pix2,
2675                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2676                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2677                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2678                                 }
2679                                 if (x <= endsub)
2680                                 {
2681                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2682                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2683                                         tci = _mm_madd_epi16(tci, tcoffset);
2684                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2685                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2686                                                                                         _mm_setzero_si128());
2687                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2688                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2689                                                                                         _mm_setzero_si128());
2690                                         fracm = _mm_srli_epi16(subtc, 1);
2691                                         pix1 = _mm_add_epi16(pix1,
2692                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2693                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2694                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2695                                         pix1 = _mm_add_epi16(pix1,
2696                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2697                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2698                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2699                                         x++;
2700                                 }
2701                         }
2702                 }
2703                 else
2704                 {
2705                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2706                         {
2707                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2708                                 {
2709                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2710                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2711                                         tci = _mm_madd_epi16(tci, tcoffset);
2712                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2713                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2714                                 }
2715                                 if (x <= endsub)
2716                                 {
2717                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2718                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2719                                         tci = _mm_madd_epi16(tci, tcoffset);
2720                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2721                                         x++;
2722                                 }
2723                         }
2724                         else
2725                         {
2726                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2727                                 {
2728                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2729                                         tci = _mm_and_si128(tci, tcmax); 
2730                                         tci = _mm_madd_epi16(tci, tcoffset);
2731                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2732                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2733                                 }
2734                                 if (x <= endsub)
2735                                 {
2736                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2737                                         tci = _mm_and_si128(tci, tcmax); 
2738                                         tci = _mm_madd_epi16(tci, tcoffset);
2739                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2740                                         x++;
2741                                 }
2742                         }
2743                 }
2744         }
2745 #endif
2746 }
2747
2748 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2749 {
2750         // TODO: IMPLEMENT
2751         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2752 }
2753
// Shadowmap lookup stub: sampling is not implemented, so the result is the
// constant 1.0f no matter which vector is passed in.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2759
2760 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2761 {
2762         int x;
2763         int startx = span->startx;
2764         int endx = span->endx;
2765         float c[4];
2766         float data[4];
2767         float slope[4];
2768         float z;
2769         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2770         for (x = startx;x < endx;x++)
2771         {
2772                 z = zf[x];
2773                 c[0] = (data[0] + slope[0]*x) * z;
2774                 c[1] = (data[1] + slope[1]*x) * z;
2775                 c[2] = (data[2] + slope[2]*x) * z;
2776                 c[3] = (data[3] + slope[3]*x) * z;
2777                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2778                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2779                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2780                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2781         }
2782 }
2783
2784 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2785 {
2786         int x;
2787         int startx = span->startx;
2788         int endx = span->endx;
2789         float c[4];
2790         float data[4];
2791         float slope[4];
2792         float z;
2793         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2794         for (x = startx;x < endx;x++)
2795         {
2796                 z = zf[x];
2797                 c[0] = (data[0] + slope[0]*x) * z;
2798                 c[1] = (data[1] + slope[1]*x) * z;
2799                 c[2] = (data[2] + slope[2]*x) * z;
2800                 c[3] = (data[3] + slope[3]*x) * z;
2801                 out4f[x*4+0] = c[0];
2802                 out4f[x*4+1] = c[1];
2803                 out4f[x*4+2] = c[2];
2804                 out4f[x*4+3] = c[3];
2805         }
2806 }
2807
2808 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2809 {
2810         int x, startx = span->startx, endx = span->endx;
2811         float c[4], localcolor[4];
2812         localcolor[0] = subcolor[0];
2813         localcolor[1] = subcolor[1];
2814         localcolor[2] = subcolor[2];
2815         localcolor[3] = subcolor[3];
2816         for (x = startx;x < endx;x++)
2817         {
2818                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2819                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2820                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2821                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2822                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2823                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2824                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2825                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2826         }
2827 }
2828
2829 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2830 {
2831         int x, startx = span->startx, endx = span->endx;
2832         for (x = startx;x < endx;x++)
2833         {
2834                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2835                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2836                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2837                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2838         }
2839 }
2840
2841 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2842 {
2843         int x, startx = span->startx, endx = span->endx;
2844         for (x = startx;x < endx;x++)
2845         {
2846                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2847                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2848                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2849                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2850         }
2851 }
2852
2853 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2854 {
2855         int x, startx = span->startx, endx = span->endx;
2856         float a, b;
2857         for (x = startx;x < endx;x++)
2858         {
2859                 a = 1.0f - inb4f[x*4+3];
2860                 b = inb4f[x*4+3];
2861                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2862                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2863                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2864                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2865         }
2866 }
2867
2868 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2869 {
2870         int x, startx = span->startx, endx = span->endx;
2871         float localcolor[4], ilerp, lerp;
2872         localcolor[0] = color[0];
2873         localcolor[1] = color[1];
2874         localcolor[2] = color[2];
2875         localcolor[3] = color[3];
2876         ilerp = 1.0f - localcolor[3];
2877         lerp = localcolor[3];
2878         for (x = startx;x < endx;x++)
2879         {
2880                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2881                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2882                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2883                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2884         }
2885 }
2886
2887
2888
2889 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2890 {
2891 #ifdef SSE2_PRESENT
2892         int x;
2893         int startx = span->startx;
2894         int endx = span->endx;
2895         __m128 data, slope;
2896         __m128 mod, endmod;
2897         __m128i submod, substep, endsubmod;
2898         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2899         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2900         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2901         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2902         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2903         for (x = startx; x < endx;)
2904         {
2905                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2906                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2907                 if (nextsub >= endx)
2908                 {
2909                         nextsub = endsub = endx-1;
2910                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2911                 }
2912                 mod = endmod;
2913                 submod = endsubmod;
2914                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2915                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2916                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2917                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2918                 substep = _mm_packs_epi32(substep, substep);
2919                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2920                 {
2921                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2922                         pix = _mm_mulhi_epu16(pix, submod);
2923                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2924                 }
2925                 if (x <= endsub)
2926                 {
2927                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2928                         pix = _mm_mulhi_epu16(pix, submod);
2929                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2930                         x++;
2931                 }
2932         }
2933 #endif
2934 }
2935
2936 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2937 {
2938 #ifdef SSE2_PRESENT
2939         int x;
2940         int startx = span->startx;
2941         int endx = span->endx;
2942         __m128 data, slope;
2943         __m128 mod, endmod;
2944         __m128i submod, substep, endsubmod;
2945         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2946         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2947         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2948         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2949         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2950         for (x = startx; x < endx;)
2951         {
2952                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2953                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2954                 if (nextsub >= endx)
2955                 {
2956                         nextsub = endsub = endx-1;
2957                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2958                 }
2959                 mod = endmod;
2960                 submod = endsubmod;
2961                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2962                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2963                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2964                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2965                 substep = _mm_packs_epi32(substep, substep);
2966                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2967                 {
2968                         __m128i pix = _mm_srai_epi16(submod, 4);
2969                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2970            &