WGL client can now use DPSOFTRAST, added thread_win.c to avoid SDL dependency for...
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// MSVC builds only: disable compiler warning 4324 for this translation unit.
// NOTE(review): presumably the "structure was padded due to alignment" warning
// triggered by the ALIGN/ATOMIC declarations below - confirm.
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
// When compiled as C (not C++), provide `bool` as an alias of the engine's qboolean.
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// Alignment (in bytes) requested by ALIGN() and ATOMIC() respectively.
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
// Per-compiler definitions of the alignment and atomic helpers.  They are only
// selected when SSE_POSSIBLE is defined; anything left undefined here falls back
// to the plain, non-atomic definitions further down.
//   ALIGN(var)               declare var with 16-byte alignment
//   ATOMIC(var)              declare var with 32-byte alignment
//   MEMORY_BARRIER           store fence (_mm_sfence)
//   ATOMIC_COUNTER           type of a shared counter
//   ATOMIC_INCREMENT/DECREMENT(counter)   atomic ++/-- on a counter
//   ATOMIC_ADD(counter, val) atomic add, result discarded
20 #ifdef SSE_POSSIBLE
21         #if defined(__APPLE__)
22                 #include <libkern/OSAtomic.h>
23                 #define ALIGN(var) var __attribute__((__aligned__(16)))
24                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25                 #define MEMORY_BARRIER (_mm_sfence())
26                 #define ATOMIC_COUNTER volatile int32_t 
27                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
30         #elif defined(__GNUC__)
31                 #define ALIGN(var) var __attribute__((__aligned__(16)))
32                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33                 #define MEMORY_BARRIER (_mm_sfence())
34                 //(__sync_synchronize())
35                 #define ATOMIC_COUNTER volatile int
36                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
39         #elif defined(_MSC_VER)
40                 #define ALIGN(var) __declspec(align(16)) var
41                 #define ATOMIC(var) __declspec(align(32)) var
42                 #define MEMORY_BARRIER (_mm_sfence())
43                 //(MemoryBarrier())
44                 #define ATOMIC_COUNTER volatile LONG
45                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
48         #endif
49 #endif
50
// Fallbacks used when SSE_POSSIBLE is not defined or no compiler branch above
// matched: no alignment is requested, the barrier is a no-op and the counter
// operations are ordinary (non-atomic) integer arithmetic.
51 #ifndef ALIGN
52 #define ALIGN(var) var
53 #endif
54 #ifndef ATOMIC
55 #define ATOMIC(var) var
56 #endif
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
59 #endif
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
62 #endif
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
65 #endif
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
68 #endif
69 #ifndef ATOMIC_ADD
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
71 #endif
72
// Allocation helpers.  With SSE_POSSIBLE the buffers come from _mm_malloc with
// ATOMIC_SIZE alignment and must be released with MM_FREE (_mm_free); otherwise
// the plain C allocator is used.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// Aligned counterpart of calloc(): returns nmemb*size zeroed bytes, or NULL when
// the allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc() rejects a wrapping nmemb*size; do the same instead of silently
	// allocating a short buffer
	if (size != 0 && nmemb > ((size_t)-1) / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
// Vertex attribute array slots.  The values index the first dimension of the
// per-triangle `attribs` table and the `post_array4f` pointers of the global
// state; DPSOFTRAST_ARRAY_TOTAL is the number of slots.
92 typedef enum DPSOFTRAST_ARRAY_e
93 {
94         DPSOFTRAST_ARRAY_POSITION,
95         DPSOFTRAST_ARRAY_COLOR,
96         DPSOFTRAST_ARRAY_TEXCOORD0,
97         DPSOFTRAST_ARRAY_TEXCOORD1,
98         DPSOFTRAST_ARRAY_TEXCOORD2,
99         DPSOFTRAST_ARRAY_TEXCOORD3,
100         DPSOFTRAST_ARRAY_TEXCOORD4,
101         DPSOFTRAST_ARRAY_TEXCOORD5,
102         DPSOFTRAST_ARRAY_TEXCOORD6,
103         DPSOFTRAST_ARRAY_TEXCOORD7,
104         DPSOFTRAST_ARRAY_TOTAL
105 }
106 DPSOFTRAST_ARRAY;
107
// One entry of the texture table (dpsoftrast.texture).  A slot whose `bytes`
// pointer is NULL is treated as free by the texture allocator and lookup code.
108 typedef struct DPSOFTRAST_Texture_s
109 {
110         int flags; // DPSOFTRAST_TEXTURE_* flags given to DPSOFTRAST_Texture_New
111         int width; // dimensions of mip level 0
112         int height;
113         int depth;
114         int sides; // 6 when the cubemap flag is set, otherwise 1
115         DPSOFTRAST_TEXTURE_FILTER filter;
116         int mipmaps; // number of valid entries in mipmap[]
117         int size; // total size in bytes of all mip levels
118         ATOMIC_COUNTER binds; // while nonzero, functions that modify or expose the pixels call DPSOFTRAST_Flush first
119         unsigned char *bytes; // pixel storage for all mip levels (4 bytes per pixel)
120         int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: byte offset into bytes, byte size, width, height, depth
121 }
122 DPSOFTRAST_Texture;
123
// Commands use the same alignment as ALIGN() data.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
126
// Common header shared by every command structure (DEFCOMMAND repeats these two
// fields at the start of each generated struct).
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
128 {
129         unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* values
130         unsigned short commandsize; // NOTE(review): presumably the size of the whole command in bytes - confirm against the code that fills it in
131 }
132 DPSOFTRAST_Command);
133
// Opcode 0 is reserved for the reset command.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
135
// DEFCOMMAND(opcodeval, name, fields) declares a command type:
//  - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//  - the aligned struct DPSOFTRAST_Command_<name>, which begins with the same
//    opcode/commandsize header as DPSOFTRAST_Command followed by `fields`
136 #define DEFCOMMAND(opcodeval, name, fields) \
137         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
139         { \
140                 unsigned char opcode; \
141                 unsigned short commandsize; \
142                 fields \
143         } DPSOFTRAST_Command_##name );
144
// Size in bytes of the command pool buffer, and a per-command size limit.
// NOTE(review): the limit is presumably enforced by the command allocator,
// which is not visible in this part of the file.
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
147
// Storage for queued commands.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
149 {
150         int freecommand; // NOTE(review): presumably the offset in commands[] where the next command is written - confirm
151         int usedcommands; // NOTE(review): presumably the number of bytes currently occupied - confirm
152         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]); // raw command bytes
153 }
154 DPSOFTRAST_State_Command_Pool);
155
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
157 {
158         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
159         float w[3];
// attribs[array][0] is the per-x slope, [1] the per-y slope and [2] the base
// value of each 4-component attribute (see DPSOFTRAST_CALCATTRIB)
160         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
161 }
162 DPSOFTRAST_State_Triangle);
163
// Evaluates one 4-component attribute of `triangle` at the start of `span`:
//   slope = attribs[arrayindex][0]                       (change per pixel in x)
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// DPSOFTRAST_CALCATTRIB is the SSE form (slope/data are __m128 lvalues),
// DPSOFTRAST_CALCATTRIB4F the scalar form (slope/data are float[4]).
// All pointer and index arguments are parenthesized so compound expressions
// such as `spans + i` expand correctly; they are still evaluated several times,
// so they must be free of side effects.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[(arrayindex)][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[(arrayindex)][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[(arrayindex)][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[(arrayindex)][0][0]; \
	(slope)[1] = (triangle)->attribs[(arrayindex)][0][1]; \
	(slope)[2] = (triangle)->attribs[(arrayindex)][0][2]; \
	(slope)[3] = (triangle)->attribs[(arrayindex)][0][3]; \
	(data)[0] = (triangle)->attribs[(arrayindex)][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][0]; \
	(data)[1] = (triangle)->attribs[(arrayindex)][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][1]; \
	(data)[2] = (triangle)->attribs[(arrayindex)][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][2]; \
	(data)[3] = (triangle)->attribs[(arrayindex)][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][3]; \
}
180                                         
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
182
// One horizontal run of pixels produced for a triangle; per-thread spans are
// stored in DPSOFTRAST_State_Thread.spans.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
184 {
185         int triangle; // triangle this span was generated by
186         int x; // framebuffer x coord
187         int y; // framebuffer y coord
188         int startx; // usable range (according to pixelmask)
189         int endx; // usable range (according to pixelmask)
190         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
191 }
192 DPSOFTRAST_State_Span);
193
// Capacity of the per-thread span and triangle arrays (DPSOFTRAST_State_Thread).
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
196
// Bits of DPSOFTRAST_State_Thread.validate: each one marks a group of derived
// values as stale until DPSOFTRAST_Validate recomputes it.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
201
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc maps the
// GL source/destination factor pair (and the subtract flag) onto one of these.
202 typedef enum DPSOFTRAST_BLENDMODE_e
203 {
204         DPSOFTRAST_BLENDMODE_OPAQUE,
205         DPSOFTRAST_BLENDMODE_ALPHA,
206         DPSOFTRAST_BLENDMODE_ADDALPHA,
207         DPSOFTRAST_BLENDMODE_ADD,
208         DPSOFTRAST_BLENDMODE_INVMOD,
209         DPSOFTRAST_BLENDMODE_MUL,
210         DPSOFTRAST_BLENDMODE_MUL2,
211         DPSOFTRAST_BLENDMODE_SUBALPHA,
212         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
213         DPSOFTRAST_BLENDMODE_INVADD,
214         DPSOFTRAST_BLENDMODE_TOTAL
215 }
216 DPSOFTRAST_BLENDMODE;
217
// Per-thread rasterizer state: a private copy of the render state, the values
// derived from it, the rows this thread covers and its span/triangle buffers.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
219 {
220         void *thread;
221         int index; // position of this thread in dpsoftrast.threads; used to compute its row bands
222         
223         int cullface;
224         int colormask[4];
225         int blendfunc[2]; // source factor, destination factor
226         int blendsubtract;
227         int depthmask;
228         int depthtest;
229         int depthfunc;
230         int scissortest;
231         int alphatest;
232         int alphafunc;
233         float alphavalue;
234         int viewport[4];
235         int scissor[4];
236         float depthrange[2];
237         float polygonoffset[2];
238
239         int shader_mode;
240         int shader_permutation;
241         int shader_exactspecularmath;
242
243         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into dpsoftrast.texture; rebased when that array is reallocated
244         
245         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
247
248         // DPSOFTRAST_VALIDATE_ flags
249         int validate;
250
251         // derived values (DPSOFTRAST_VALIDATE_FB)
252         int fb_colormask;
253         int fb_scissor[4]; // x, y, width, height in framebuffer rows, clamped to the framebuffer
254         ALIGN(float fb_viewportcenter[4]);
255         ALIGN(float fb_viewportscale[4]);
256
257         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
258         int fb_depthfunc; // depthfunc, or GL_ALWAYS when depth testing is off
259
260         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
261         int fb_blendmode; // a DPSOFTRAST_BLENDMODE value
262
263         // band boundaries
// rows [miny1,maxy1) and [miny2,maxy2); both bands are identical unless interlacing is enabled
264         int miny1;
265         int maxy1;
266         int miny2;
267         int maxy2;
268
269         ATOMIC(volatile int commandoffset);
270
271         volatile bool waiting;
272         volatile bool starving;
273         void *waitcond;
274         void *drawcond;
275         void *drawmutex;
276
277         int numspans;
278         int numtriangles;
279         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
280         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
281 }
282 DPSOFTRAST_State_Thread);
283
// Global rasterizer state: framebuffer, vertex array pointers, the texture
// table, the worker threads and the shared command pool.
284 typedef ATOMIC(struct DPSOFTRAST_State_s
285 {
286         int fb_width;
287         int fb_height;
288         unsigned int *fb_depthpixels;
289         unsigned int *fb_colorpixels[4];
290
291         int viewport[4];
292         ALIGN(float fb_viewportcenter[4]);
293         ALIGN(float fb_viewportscale[4]);
294
295         float color[4];
296         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
297         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
298
299         const float *pointer_vertex3f;
300         const float *pointer_color4f;
301         const unsigned char *pointer_color4ub;
302         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
303         int stride_vertex;
304         int stride_color;
305         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
306         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
307         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into `texture`; rebased when that array is reallocated
308
309         int firstvertex;
310         int numvertices;
311         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
312         float *screencoord4f;
313         int drawstarty;
314         int drawendy;
315         int drawclipped;
316         
317         int shader_mode;
318         int shader_permutation;
319         int shader_exactspecularmath;
320
// texture table: texture_max is the allocated capacity, texture_end is one past
// the highest slot in use, texture_firstfree is where the search for a free
// slot starts
321         int texture_max;
322         int texture_end;
323         int texture_firstfree;
324         DPSOFTRAST_Texture *texture;
325
326         int bigendian;
327
328         // error reporting
329         const char *errorstring; // points at a static message describing the most recent failure
330
331         bool usethreads;
332         int interlace; // when nonzero each thread covers two row bands instead of one
333         int numthreads;
334         DPSOFTRAST_State_Thread *threads;
335
336         ATOMIC(volatile int drawcommand);
337
338         DPSOFTRAST_State_Command_Pool commandpool;
339 }
340 DPSOFTRAST_State);
341
// the single global rasterizer instance
342 DPSOFTRAST_State dpsoftrast;
343
// Scale applied when converting a floating point depth value to an integer.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one integer with b in bits 0-7, g in bits
// 8-15, r in bits 16-23 and a in bits 24-31; each channel is scaled by 255 and
// rounded.  Arguments are parenthesized so expressions such as `c + d` convert
// correctly.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
349
350 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
351 {
352         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
353         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
354         fb_viewportcenter[3] = 0.5f;
355         fb_viewportcenter[0] = 0.0f;
356         fb_viewportscale[1] = 0.5f * viewport[2];
357         fb_viewportscale[2] = -0.5f * viewport[3];
358         fb_viewportscale[3] = 0.5f;
359         fb_viewportscale[0] = 1.0f;
360 }
361
362 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
363 {
364         if (dpsoftrast.interlace)
365         {
366                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
367                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370         }
371         else
372         {
373                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
374                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
375         }
376 }
377
378 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
379 {
380         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
381         // and viewport projection values
382         int x1, x2;
383         int y1, y2;
384         x1 = thread->scissor[0];
385         x2 = thread->scissor[0] + thread->scissor[2];
386         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
387         y2 = dpsoftrast.fb_height - thread->scissor[1];
388         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
389         if (x1 < 0) x1 = 0;
390         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
391         if (y1 < 0) y1 = 0;
392         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
393         thread->fb_scissor[0] = x1;
394         thread->fb_scissor[1] = y1;
395         thread->fb_scissor[2] = x2 - x1;
396         thread->fb_scissor[3] = y2 - y1;
397
398         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
399         DPSOFTRAST_RecalcThread(thread);
400 }
401
402 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
403 {
404         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
405 }
406
407 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
408 {
409         if (thread->blendsubtract)
410         {
411                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
412                 {
413                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
414                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
415                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
416                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
417                 }
418         }
419         else
420         {       
421                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
422                 {
423                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
424                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
425                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
426                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
427                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
428                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
429                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
430                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
431                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
432                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
433                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
434                 }
435         }
436 }
437
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in f
// is still pending in thread->validate; always evaluates to 0.  `thread` is
// parenthesized so any pointer expression may be passed; it is evaluated more
// than once, so it must be free of side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
439
440 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
441 {
442         mask &= thread->validate;
443         if (!mask)
444                 return;
445         if (mask & DPSOFTRAST_VALIDATE_FB)
446         {
447                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
448                 DPSOFTRAST_RecalcFB(thread);
449         }
450         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
451         {
452                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
453                 DPSOFTRAST_RecalcDepthFunc(thread);
454         }
455         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
456         {
457                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
458                 DPSOFTRAST_RecalcBlendFunc(thread);
459         }
460 }
461
462 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
463 {
464         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
465                 return &dpsoftrast.texture[index];
466         return NULL;
467 }
468
469 static void DPSOFTRAST_Texture_Grow(void)
470 {
471         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
472         DPSOFTRAST_State_Thread *thread;
473         int i;
474         int j;
475         DPSOFTRAST_Flush();
476         // expand texture array as needed
477         if (dpsoftrast.texture_max < 1024)
478                 dpsoftrast.texture_max = 1024;
479         else
480                 dpsoftrast.texture_max *= 2;
481         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
482         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
483                 if (dpsoftrast.texbound[i])
484                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
485         for (j = 0; j < dpsoftrast.numthreads; j++)
486         {
487                 thread = &dpsoftrast.threads[j];
488                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
489                         if (thread->texbound[i])
490                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
491         }
492 }
493
494 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
495 {
496         int w;
497         int h;
498         int d;
499         int size;
500         int s;
501         int texnum;
502         int mipmaps;
503         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
504         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
505         DPSOFTRAST_Texture *texture;
506         if (width*height*depth < 1)
507         {
508                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
509                 return 0;
510         }
511         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
512         {
513                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
514                 return 0;
515         }
516         switch(texformat)
517         {
518         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
519         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
520         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
521                 break;
522         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
523                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
524                 {
525                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
526                         return 0;
527                 }
528                 if (depth != 1)
529                 {
530                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
531                         return 0;
532                 }
533                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
534                 {
535                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
536                         return 0;
537                 }
538                 break;
539         }
540         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
541         {
542                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
543                 return 0;
544         }
545         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
546         {
547                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
548                 return 0;
549         }
550         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
551         {
552                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
553                 return 0;
554         }
555         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
556         {
557                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
558                 return 0;
559         }
560         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
561         {
562                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
563                 return 0;
564         }
565         // find first empty slot in texture array
566         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
567                 if (!dpsoftrast.texture[texnum].bytes)
568                         break;
569         dpsoftrast.texture_firstfree = texnum + 1;
570         if (dpsoftrast.texture_max <= texnum)
571                 DPSOFTRAST_Texture_Grow();
572         if (dpsoftrast.texture_end <= texnum)
573                 dpsoftrast.texture_end = texnum + 1;
574         texture = &dpsoftrast.texture[texnum];
575         memset(texture, 0, sizeof(*texture));
576         texture->flags = flags;
577         texture->width = width;
578         texture->height = height;
579         texture->depth = depth;
580         texture->sides = sides;
581         texture->binds = 0;
582         w = width;
583         h = height;
584         d = depth;
585         size = 0;
586         mipmaps = 0;
587         w = width;
588         h = height;
589         d = depth;
590         for (;;)
591         {
592                 s = w * h * d * sides * 4;
593                 texture->mipmap[mipmaps][0] = size;
594                 texture->mipmap[mipmaps][1] = s;
595                 texture->mipmap[mipmaps][2] = w;
596                 texture->mipmap[mipmaps][3] = h;
597                 texture->mipmap[mipmaps][4] = d;
598                 size += s;
599                 mipmaps++;
600                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
601                         break;
602                 if (w > 1) w >>= 1;
603                 if (h > 1) h >>= 1;
604                 if (d > 1) d >>= 1;
605         }
606         texture->mipmaps = mipmaps;
607         texture->size = size;
608
609         // allocate the pixels now
610         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
611
612         return texnum;
613 }
614 void DPSOFTRAST_Texture_Free(int index)
615 {
616         DPSOFTRAST_Texture *texture;
617         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
618         if (texture->binds)
619                 DPSOFTRAST_Flush();
620         if (texture->bytes)
621                 MM_FREE(texture->bytes);
622         texture->bytes = NULL;
623         memset(texture, 0, sizeof(*texture));
624         // adjust the free range and used range
625         if (dpsoftrast.texture_firstfree > index)
626                 dpsoftrast.texture_firstfree = index;
627         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
628                 dpsoftrast.texture_end--;
629 }
630 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
631 {
632         int i, x, y, z, w, layer0, layer1, row0, row1;
633         unsigned char *o, *i0, *i1, *i2, *i3;
634         DPSOFTRAST_Texture *texture;
635         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636         if (texture->mipmaps <= 1)
637                 return;
638         for (i = 1;i < texture->mipmaps;i++)
639         {
640                 for (z = 0;z < texture->mipmap[i][4];z++)
641                 {
642                         layer0 = z*2;
643                         layer1 = z*2+1;
644                         if (layer1 >= texture->mipmap[i-1][4])
645                                 layer1 = texture->mipmap[i-1][4]-1;
646                         for (y = 0;y < texture->mipmap[i][3];y++)
647                         {
648                                 row0 = y*2;
649                                 row1 = y*2+1;
650                                 if (row1 >= texture->mipmap[i-1][3])
651                                         row1 = texture->mipmap[i-1][3]-1;
652                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
653                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
654                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
655                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
656                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
657                                 w = texture->mipmap[i][2];
658                                 if (layer1 > layer0)
659                                 {
660                                         if (texture->mipmap[i-1][2] > 1)
661                                         {
662                                                 // average 3D texture
663                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
664                                                 {
665                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
666                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
667                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
668                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
669                                                 }
670                                         }
671                                         else
672                                         {
673                                                 // average 3D mipmap with parent width == 1
674                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
675                                                 {
676                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
677                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
678                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
679                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
680                                                 }
681                                         }
682                                 }
683                                 else
684                                 {
685                                         if (texture->mipmap[i-1][2] > 1)
686                                         {
687                                                 // average 2D texture (common case)
688                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
689                                                 {
690                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
691                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
692                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
693                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
694                                                 }
695                                         }
696                                         else
697                                         {
698                                                 // 2D texture with parent width == 1
699                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
700                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
701                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
702                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
703                                         }
704                                 }
705                         }
706                 }
707         }
708 }
709 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
710 {
711         DPSOFTRAST_Texture *texture;
712         unsigned char *dst;
713         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
714         if (texture->binds)
715                 DPSOFTRAST_Flush();
716         if (pixels)
717         {
718                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
719                 while (blockheight > 0)
720                 {
721                         memcpy(dst, pixels, blockwidth * 4);
722                         pixels += blockwidth * 4;
723                         dst += texture->mipmap[0][2] * 4;
724                         blockheight--;
725                 }
726         }
727         DPSOFTRAST_Texture_CalculateMipmaps(index);
728 }
729 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (texture->binds)
734                 DPSOFTRAST_Flush();
735         if (pixels)
736                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
737         DPSOFTRAST_Texture_CalculateMipmaps(index);
738 }
739 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
740 {
741         DPSOFTRAST_Texture *texture;
742         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
743         return texture->mipmap[mip][2];
744 }
745 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
746 {
747         DPSOFTRAST_Texture *texture;
748         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
749         return texture->mipmap[mip][3];
750 }
751 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
752 {
753         DPSOFTRAST_Texture *texture;
754         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755         return texture->mipmap[mip][4];
756 }
757 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         if (texture->binds)
762                 DPSOFTRAST_Flush();
763         return texture->bytes + texture->mipmap[mip][0];
764 }
765 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
766 {
767         DPSOFTRAST_Texture *texture;
768         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
770         {
771                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
772                 return;
773         }
774         if (texture->binds)
775                 DPSOFTRAST_Flush();
776         texture->filter = filter;
777 }
778
779 static void DPSOFTRAST_Draw_FlushThreads(void);
780
781 static void DPSOFTRAST_Draw_SyncCommands(void)
782 {
783         if(dpsoftrast.usethreads) MEMORY_BARRIER;
784         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
785 }
786
787 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
788 {
789         DPSOFTRAST_State_Thread *thread;
790         int i;
791         int freecommand = dpsoftrast.commandpool.freecommand;
792         int usedcommands = dpsoftrast.commandpool.usedcommands;
793         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
794                 return;
795         DPSOFTRAST_Draw_SyncCommands();
796         for(;;)
797         {
798                 int waitindex = -1;
799                 int commandoffset;
800                 usedcommands = 0;
801                 for (i = 0; i < dpsoftrast.numthreads; i++)
802                 {
803                         thread = &dpsoftrast.threads[i]; 
804                         commandoffset = freecommand - thread->commandoffset;
805                         if (commandoffset < 0)
806                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
807                         if (commandoffset > usedcommands)
808                         {
809                                 waitindex = i;
810                                 usedcommands = commandoffset;
811                         }
812                 }
813                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
814                         break;
815                 thread = &dpsoftrast.threads[waitindex];
816                 Thread_LockMutex(thread->drawmutex);
817                 if (thread->commandoffset != dpsoftrast.drawcommand)
818                 {
819                         thread->waiting = true;
820                         if (thread->starving) Thread_CondSignal(thread->drawcond);
821                         Thread_CondWait(thread->waitcond, thread->drawmutex);
822                         thread->waiting = false;
823                 }
824                 Thread_UnlockMutex(thread->drawmutex);
825         }
826         dpsoftrast.commandpool.usedcommands = usedcommands;
827 }
828
// rounds size up to the next multiple of COMMAND_SIZE
// (the masking only works when COMMAND_SIZE is a power of two)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
        ((size) + ((0 - (size)) & (COMMAND_SIZE-1)))
// allocates a DPSOFTRAST_Command_<name> of aligned size from the command pool,
// tagged with DPSOFTRAST_OPCODE_<name>, and yields a pointer to it
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
        ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
833
834 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
835 {
836         DPSOFTRAST_Command *command;
837         int freecommand = dpsoftrast.commandpool.freecommand;
838         int usedcommands = dpsoftrast.commandpool.usedcommands;
839         int extra = sizeof(DPSOFTRAST_Command);
840         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
842         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
843         {
844                 if (dpsoftrast.usethreads)
845                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
846                 else
847                         DPSOFTRAST_Draw_FlushThreads();
848                 freecommand = dpsoftrast.commandpool.freecommand;
849                 usedcommands = dpsoftrast.commandpool.usedcommands;
850         }
851         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
852         {
853                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
854                 command->opcode = DPSOFTRAST_OPCODE_Reset;
855                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
856                 freecommand = 0;
857         }
858         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
859         command->opcode = opcode;
860         command->commandsize = size;
861         freecommand += size;
862         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
863                 freecommand = 0;
864         dpsoftrast.commandpool.freecommand = freecommand;
865         dpsoftrast.commandpool.usedcommands = usedcommands + size;
866         return command;
867 }
868
869 static void DPSOFTRAST_UndoCommand(int size)
870 {
871         int freecommand = dpsoftrast.commandpool.freecommand;
872         int usedcommands = dpsoftrast.commandpool.usedcommands;
873         freecommand -= size;
874         if (freecommand < 0)
875                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
876         usedcommands -= size;
877         dpsoftrast.commandpool.freecommand = freecommand;
878         dpsoftrast.commandpool.usedcommands = usedcommands;
879 }
880                 
881 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
882 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
883 {
884         thread->viewport[0] = command->x;
885         thread->viewport[1] = command->y;
886         thread->viewport[2] = command->width;
887         thread->viewport[3] = command->height;
888         thread->validate |= DPSOFTRAST_VALIDATE_FB;
889 }
890 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
891 {
892         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
893         command->x = x;
894         command->y = y;
895         command->width = width;
896         command->height = height;
897
898         dpsoftrast.viewport[0] = x;
899         dpsoftrast.viewport[1] = y;
900         dpsoftrast.viewport[2] = width;
901         dpsoftrast.viewport[3] = height;
902         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
903 }
904
905 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
906 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
907 {
908         int i, x1, y1, x2, y2, w, h, x, y;
909         int miny1, maxy1, miny2, maxy2;
910         int bandy;
911         unsigned int *p;
912         unsigned int c;
913         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
914         miny1 = thread->miny1;
915         maxy1 = thread->maxy1;
916         miny2 = thread->miny2;
917         maxy2 = thread->maxy2;
918         x1 = thread->fb_scissor[0];
919         y1 = thread->fb_scissor[1];
920         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
921         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
922         if (y1 < miny1) y1 = miny1;
923         if (y2 > maxy2) y2 = maxy2;
924         w = x2 - x1;
925         h = y2 - y1;
926         if (w < 1 || h < 1)
927                 return;
928         // FIXME: honor fb_colormask?
929         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
930         for (i = 0;i < 4;i++)
931         {
932                 if (!dpsoftrast.fb_colorpixels[i])
933                         continue;
934                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
935                 for (;y < bandy;y++)
936                 {
937                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
938                         for (x = x1;x < x2;x++)
939                                 p[x] = c;
940                 }
941         }
942 }
943 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
944 {
945         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
946         command->r = r;
947         command->g = g;
948         command->b = b;
949         command->a = a;
950 }
951
952 DEFCOMMAND(3, ClearDepth, float depth;)
953 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
954 {
955         int x1, y1, x2, y2, w, h, x, y;
956         int miny1, maxy1, miny2, maxy2;
957         int bandy;
958         unsigned int *p;
959         unsigned int c;
960         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
961         miny1 = thread->miny1;
962         maxy1 = thread->maxy1;
963         miny2 = thread->miny2;
964         maxy2 = thread->maxy2;
965         x1 = thread->fb_scissor[0];
966         y1 = thread->fb_scissor[1];
967         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
968         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
969         if (y1 < miny1) y1 = miny1;
970         if (y2 > maxy2) y2 = maxy2;
971         w = x2 - x1;
972         h = y2 - y1;
973         if (w < 1 || h < 1)
974                 return;
975         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
976         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
977         for (;y < bandy;y++)
978         {
979                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
980                 for (x = x1;x < x2;x++)
981                         p[x] = c;
982         }
983 }
984 void DPSOFTRAST_ClearDepth(float d)
985 {
986         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
987         command->depth = d;
988 }
989
990 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
991 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
992 {
993         thread->colormask[0] = command->r != 0;
994         thread->colormask[1] = command->g != 0;
995         thread->colormask[2] = command->b != 0;
996         thread->colormask[3] = command->a != 0;
997         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
998 }
999 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1000 {
1001         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1002         command->r = r;
1003         command->g = g;
1004         command->b = b;
1005         command->a = a;
1006 }
1007
1008 DEFCOMMAND(5, DepthTest, int enable;)
1009 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1010 {
1011         thread->depthtest = command->enable;
1012         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1013 }
1014 void DPSOFTRAST_DepthTest(int enable)
1015 {
1016         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1017         command->enable = enable;
1018 }
1019
1020 DEFCOMMAND(6, ScissorTest, int enable;)
1021 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1022 {
1023         thread->scissortest = command->enable;
1024         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1025 }
1026 void DPSOFTRAST_ScissorTest(int enable)
1027 {
1028         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1029         command->enable = enable;
1030 }
1031
1032 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1033 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1034 {
1035         thread->scissor[0] = command->x;
1036         thread->scissor[1] = command->y;
1037         thread->scissor[2] = command->width;
1038         thread->scissor[3] = command->height;
1039         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1040 }
1041 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1042 {
1043         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1044         command->x = x;
1045         command->y = y;
1046         command->width = width;
1047         command->height = height;
1048 }
1049
1050 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1051 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1052 {
1053         thread->blendfunc[0] = command->sfactor;
1054         thread->blendfunc[1] = command->dfactor;
1055         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1056 }
1057 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1058 {
1059         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1060         command->sfactor = sfactor;
1061         command->dfactor = dfactor;
1062 }
1063
1064 DEFCOMMAND(9, BlendSubtract, int enable;)
1065 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1066 {
1067         thread->blendsubtract = command->enable;
1068         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1069 }
1070 void DPSOFTRAST_BlendSubtract(int enable)
1071 {
1072         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1073         command->enable = enable;
1074 }
1075
1076 DEFCOMMAND(10, DepthMask, int enable;)
1077 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1078 {
1079         thread->depthmask = command->enable;
1080 }
1081 void DPSOFTRAST_DepthMask(int enable)
1082 {
1083         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1084         command->enable = enable;
1085 }
1086
1087 DEFCOMMAND(11, DepthFunc, int func;)
1088 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1089 {
1090         thread->depthfunc = command->func;
1091 }
1092 void DPSOFTRAST_DepthFunc(int func)
1093 {
1094         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1095         command->func = func;
1096 }
1097
1098 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1099 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1100 {
1101         thread->depthrange[0] = command->nearval;
1102         thread->depthrange[1] = command->farval;
1103 }
1104 void DPSOFTRAST_DepthRange(float nearval, float farval)
1105 {
1106         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1107         command->nearval = nearval;
1108         command->farval = farval;
1109 }
1110
1111 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1112 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1113 {
1114         thread->polygonoffset[0] = command->alongnormal;
1115         thread->polygonoffset[1] = command->intoview;
1116 }
1117 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1118 {
1119         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1120         command->alongnormal = alongnormal;
1121         command->intoview = intoview;
1122 }
1123
1124 DEFCOMMAND(14, CullFace, int mode;)
1125 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1126 {
1127         thread->cullface = command->mode;
1128 }
1129 void DPSOFTRAST_CullFace(int mode)
1130 {
1131         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1132         command->mode = mode;
1133 }
1134
1135 DEFCOMMAND(15, AlphaTest, int enable;)
1136 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1137 {
1138         thread->alphatest = command->enable;
1139 }
1140 void DPSOFTRAST_AlphaTest(int enable)
1141 {
1142         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1143         command->enable = enable;
1144 }
1145
1146 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1147 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1148 {
1149         thread->alphafunc = command->func;
1150         thread->alphavalue = command->ref;
1151 }
1152 void DPSOFTRAST_AlphaFunc(int func, float ref)
1153 {
1154         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1155         command->func = func;
1156         command->ref = ref;
1157 }
1158
1159 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1160 {
1161         dpsoftrast.color[0] = r;
1162         dpsoftrast.color[1] = g;
1163         dpsoftrast.color[2] = b;
1164         dpsoftrast.color[3] = a;
1165 }
1166
1167 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1168 {
1169         int outstride = blockwidth * 4;
1170         int instride = dpsoftrast.fb_width * 4;
1171         int bx1 = blockx;
1172         int by1 = blocky;
1173         int bx2 = blockx + blockwidth;
1174         int by2 = blocky + blockheight;
1175         int bw;
1176         int x;
1177         int y;
1178         unsigned char *inpixels;
1179         unsigned char *b;
1180         unsigned char *o;
1181         DPSOFTRAST_Flush();
1182         if (bx1 < 0) bx1 = 0;
1183         if (by1 < 0) by1 = 0;
1184         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1185         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1186         bw = bx2 - bx1;
1187         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1188         if (dpsoftrast.bigendian)
1189         {
1190                 for (y = by1;y < by2;y++)
1191                 {
1192                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1193                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1194                         for (x = bx1;x < bx2;x++)
1195                         {
1196                                 o[0] = b[3];
1197                                 o[1] = b[2];
1198                                 o[2] = b[1];
1199                                 o[3] = b[0];
1200                                 o += 4;
1201                                 b += 4;
1202                         }
1203                 }
1204         }
1205         else
1206         {
1207                 for (y = by1;y < by2;y++)
1208                 {
1209                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1210                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1211                         memcpy(o, b, bw*4);
1212                 }
1213         }
1214
1215 }
1216 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1217 {
1218         int tx1 = tx;
1219         int ty1 = ty;
1220         int tx2 = tx + width;
1221         int ty2 = ty + height;
1222         int sx1 = sx;
1223         int sy1 = sy;
1224         int sx2 = sx + width;
1225         int sy2 = sy + height;
1226         int swidth;
1227         int sheight;
1228         int twidth;
1229         int theight;
1230         int sw;
1231         int sh;
1232         int tw;
1233         int th;
1234         int y;
1235         unsigned int *spixels;
1236         unsigned int *tpixels;
1237         DPSOFTRAST_Texture *texture;
1238         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1239         if (mip < 0 || mip >= texture->mipmaps) return;
1240         DPSOFTRAST_Flush();
1241         spixels = dpsoftrast.fb_colorpixels[0];
1242         swidth = dpsoftrast.fb_width;
1243         sheight = dpsoftrast.fb_height;
1244         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1245         twidth = texture->mipmap[mip][2];
1246         theight = texture->mipmap[mip][3];
1247         if (tx1 < 0) tx1 = 0;
1248         if (ty1 < 0) ty1 = 0;
1249         if (tx2 > twidth) tx2 = twidth;
1250         if (ty2 > theight) ty2 = theight;
1251         if (sx1 < 0) sx1 = 0;
1252         if (sy1 < 0) sy1 = 0;
1253         if (sx2 > swidth) sx2 = swidth;
1254         if (sy2 > sheight) sy2 = sheight;
1255         tw = tx2 - tx1;
1256         th = ty2 - ty1;
1257         sw = sx2 - sx1;
1258         sh = sy2 - sy1;
1259         if (tw > sw) tw = sw;
1260         if (th > sh) th = sh;
1261         if (tw < 1 || th < 1)
1262                 return;
1263         sy1 = sheight - 1 - sy1;
1264         for (y = 0;y < th;y++)
1265                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1266         if (texture->mipmaps > 1)
1267                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1268 }
1269
1270 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1271 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1272 {
1273         if (thread->texbound[command->unitnum])
1274                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1275         thread->texbound[command->unitnum] = command->texture;
1276 }
1277 void DPSOFTRAST_SetTexture(int unitnum, int index)
1278 {
1279         DPSOFTRAST_Command_SetTexture *command;
1280         DPSOFTRAST_Texture *texture;
1281         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1282         {
1283                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1284                 return;
1285         }
1286         texture = DPSOFTRAST_Texture_GetByIndex(index);
1287         if (index && !texture)
1288         {
1289                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1290                 return;
1291         }
1292
1293         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1294         command->unitnum = unitnum;
1295         command->texture = texture;
1296
1297         dpsoftrast.texbound[unitnum] = texture;
1298         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1299 }
1300
1301 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1302 {
1303         dpsoftrast.pointer_vertex3f = vertex3f;
1304         dpsoftrast.stride_vertex = stride;
1305 }
1306 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1307 {
1308         dpsoftrast.pointer_color4f = color4f;
1309         dpsoftrast.pointer_color4ub = NULL;
1310         dpsoftrast.stride_color = stride;
1311 }
1312 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1313 {
1314         dpsoftrast.pointer_color4f = NULL;
1315         dpsoftrast.pointer_color4ub = color4ub;
1316         dpsoftrast.stride_color = stride;
1317 }
1318 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1319 {
1320         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1321         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1322         dpsoftrast.stride_texcoord[unitnum] = stride;
1323 }
1324
1325 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1326 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1327 {
1328         thread->shader_mode = command->mode;
1329         thread->shader_permutation = command->permutation;
1330         thread->shader_exactspecularmath = command->exactspecularmath;
1331 }
1332 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1333 {
1334         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1335         command->mode = mode;
1336         command->permutation = permutation;
1337         command->exactspecularmath = exactspecularmath;
1338
1339         dpsoftrast.shader_mode = mode;
1340         dpsoftrast.shader_permutation = permutation;
1341         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1342 }
1343
1344 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1345 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1346 {
1347         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1348 }
1349 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1350 {
1351         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1352         command->index = index;
1353         command->val[0] = v0;
1354         command->val[1] = v1;
1355         command->val[2] = v2;
1356         command->val[3] = v3;
1357
1358         dpsoftrast.uniform4f[index*4+0] = v0;
1359         dpsoftrast.uniform4f[index*4+1] = v1;
1360         dpsoftrast.uniform4f[index*4+2] = v2;
1361         dpsoftrast.uniform4f[index*4+3] = v3;
1362 }
1363 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1364 {
1365         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1366         command->index = index;
1367         memcpy(command->val, v, sizeof(command->val));
1368
1369         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1370 }
1371
1372 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1373 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1374 {
1375         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1376 }
1377 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1378 {
1379 #ifdef SSE_POSSIBLE
1380         int i, index;
1381         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1382         {
1383                 __m128 m0, m1, m2, m3;
1384                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1385                 command->index = (DPSOFTRAST_UNIFORM)index;
1386                 if (((size_t)v)&(ALIGN_SIZE-1))
1387                 {
1388                         m0 = _mm_loadu_ps(v);
1389                         m1 = _mm_loadu_ps(v+4);
1390                         m2 = _mm_loadu_ps(v+8);
1391                         m3 = _mm_loadu_ps(v+12);
1392                 }
1393                 else
1394                 {
1395                         m0 = _mm_load_ps(v);
1396                         m1 = _mm_load_ps(v+4);
1397                         m2 = _mm_load_ps(v+8);
1398                         m3 = _mm_load_ps(v+12);
1399                 }
1400                 if (transpose)
1401                 {
1402                         __m128 t0, t1, t2, t3;
1403                         t0 = _mm_unpacklo_ps(m0, m1);
1404                         t1 = _mm_unpacklo_ps(m2, m3);
1405                         t2 = _mm_unpackhi_ps(m0, m1);
1406                         t3 = _mm_unpackhi_ps(m2, m3);
1407                         m0 = _mm_movelh_ps(t0, t1);
1408                         m1 = _mm_movehl_ps(t1, t0);
1409                         m2 = _mm_movelh_ps(t2, t3);
1410                         m3 = _mm_movehl_ps(t3, t2);                     
1411                 }
1412                 _mm_store_ps(command->val, m0);
1413                 _mm_store_ps(command->val+4, m1);
1414                 _mm_store_ps(command->val+8, m2);
1415                 _mm_store_ps(command->val+12, m3);
1416                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1417                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1418                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1419                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1420         }
1421 #endif
1422 }
1423
1424 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1425 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1426 {
1427         thread->uniform1i[command->index] = command->val;
1428 }
1429 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1430 {
1431         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1432         command->index = index;
1433         command->val = i0;
1434
1435         dpsoftrast.uniform1i[command->index] = i0;
1436 }
1437
1438 #ifdef SSE_POSSIBLE
1439 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1440 {
1441         float *end = dst + size*4;
1442         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1443         {
1444                 while (dst < end)
1445                 {
1446                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1447                         dst += 4;
1448                         src += stride;
1449                 }
1450         }
1451         else
1452         {
1453                 while (dst < end)
1454                 {
1455                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1456                         dst += 4;
1457                         src += stride;
1458                 }
1459         }
1460 }
1461
1462 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1463 {
1464         float *end = dst + size*4;
1465         if (stride == sizeof(float[3]))
1466         {
1467                 float *end4 = dst + (size&~3)*4;        
1468                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1469                 {
1470                         while (dst < end4)
1471                         {
1472                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1473                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1474                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1480                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1481                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1484                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485                                 dst += 16;
1486                                 src += 4*sizeof(float[3]);
1487                         }
1488                 }
1489                 else
1490                 {
1491                         while (dst < end4)
1492                         {
1493                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1494                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1495                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1496                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1497                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1501                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1502                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1505                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1506                                 dst += 16;
1507                                 src += 4*sizeof(float[3]);
1508                         }
1509                 }
1510         }
1511         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1512         {
1513                 while (dst < end)
1514                 {
1515                         __m128 v = _mm_loadu_ps((const float *)src);
1516                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1517                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1518                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1519                         _mm_store_ps(dst, v);
1520                         dst += 4;
1521                         src += stride;
1522                 }
1523         }
1524         else
1525         {
1526                 while (dst < end)
1527                 {
1528                         __m128 v = _mm_load_ps((const float *)src);
1529                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1530                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1531                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1532                         _mm_store_ps(dst, v);
1533                         dst += 4;
1534                         src += stride;
1535                 }
1536         }
1537 }
1538
1539 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1540 {
1541         float *end = dst + size*4;
1542         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1543         if (stride == sizeof(float[2]))
1544         {
1545                 float *end2 = dst + (size&~1)*4;
1546                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1547                 {
1548                         while (dst < end2)
1549                         {
1550                                 __m128 v = _mm_loadu_ps((const float *)src);
1551                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1552                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1553                                 dst += 8;
1554                                 src += 2*sizeof(float[2]);
1555                         }
1556                 }
1557                 else
1558                 {
1559                         while (dst < end2)
1560                         {
1561                                 __m128 v = _mm_load_ps((const float *)src);
1562                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1563                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1564                                 dst += 8;
1565                                 src += 2*sizeof(float[2]);
1566                         }
1567                 }
1568         }
1569         while (dst < end)
1570         {
1571                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1572                 dst += 4;
1573                 src += stride;
1574         }
1575 }
1576
1577 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1578 {
1579         float *end = dst + size*4;
1580         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1581         if (stride == sizeof(unsigned char[4]))
1582         {
1583                 float *end4 = dst + (size&~3)*4;
1584                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1585                 {
1586                         while (dst < end4)
1587                         {
1588                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1589                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1590                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1591                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1592                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1593                                 dst += 16;
1594                                 src += 4*sizeof(unsigned char[4]);
1595                         }
1596                 }
1597                 else
1598                 {
1599                         while (dst < end4)
1600                         {
1601                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1602                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1603                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1604                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1605                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1606                                 dst += 16;
1607                                 src += 4*sizeof(unsigned char[4]);
1608                         }
1609                 }
1610         }
1611         while (dst < end)
1612         {
1613                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1614                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1615                 dst += 4;
1616                 src += stride;
1617         }
1618 }
1619
// Writes `size` copies of the four floats at `src` into `dst`.
// dst is written with aligned stores and therefore has to be 16-byte aligned;
// src is read once with an unaligned load.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	int i;
	__m128 value = _mm_loadu_ps(src);
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + i*4, value);
}
1630 #endif
1631
// Transforms `numitems` four-float vectors from in4f into out4f by the 4x4
// matrix inmatrix16f: out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15].
// out4f may be the same array as in4f. out4f is written with aligned stores and
// therefore has to be 16-byte aligned; in4f may have any alignment.
// Without SSE_POSSIBLE this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 col0, col1, col2, col3;
	int i;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity matrix: a plain copy (or nothing at all when working in place)
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	col0 = _mm_loadu_ps(inmatrix16f);
	col1 = _mm_loadu_ps(inmatrix16f + 4);
	col2 = _mm_loadu_ps(inmatrix16f + 8);
	col3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 x = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
			__m128 y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
			__m128 z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
			__m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(x, col0), _mm_add_ps(_mm_mul_ps(y, col1), _mm_add_ps(_mm_mul_ps(z, col2), _mm_mul_ps(w, col3)))));
		}
	}
	else
	{
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 x = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
			__m128 y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
			__m128 z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
			__m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(x, col0), _mm_add_ps(_mm_mul_ps(y, col1), _mm_add_ps(_mm_mul_ps(z, col2), _mm_mul_ps(w, col3)))));
		}
	}
#endif
}
1679
// Copies `numitems` four-float vectors from in4f to out4f.
// The two arrays may be identical or overlap; a count of zero or less copies
// nothing.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// a negative count would turn into a huge size_t, and an in-place copy is a no-op
	if (numitems <= 0 || out4f == in4f)
		return;
	// memmove rather than memcpy: callers work in place on shared arrays, and memcpy is undefined for overlapping regions
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1684
1685 #ifdef SSE_POSSIBLE
// Perspective-divides the vector `in` = (x, y, z, w) and applies the viewport
// terms. With viewportcenter = (c0, c1, c2, c3) and viewportscale =
// (s0, s1, s2, s3) the result is
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// i.e. lane 0 of the viewport vectors belongs to the 1/w term.
// `out` and `in` may name the same variable.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_div_ps(_mm_mul_ps(viewportscale, p), w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}

// Same computation as DPSOFTRAST_PROJECTVERTEX.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
	DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)

// Multiplies the vector `in` = (x, y, z, w) by the matrix whose four __m128
// columns are m0..m3: out = x*m0 + (y*m1 + (z*m2 + w*m3)).
// `out` and `in` may name the same variable.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps( \
		_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
			_mm_add_ps( \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1710
1711 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1712 {
1713         int clipmask = 0xFF;
1714         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1715         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1716         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1717         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1718         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1719         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1720         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1721         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1722         #define BBFRONT(k, pos) \
1723         { \
1724                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1725                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1726                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1727                 { \
1728                         __m128 proj; \
1729                         clipmask &= ~(1<<k); \
1730                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1731                         minproj = _mm_min_ss(minproj, proj); \
1732                         maxproj = _mm_max_ss(maxproj, proj); \
1733                 } \
1734         }
1735         BBFRONT(0, minpos); 
1736         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1737         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1738         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1739         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1740         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1741         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1742         BBFRONT(7, maxpos);
1743         #define BBCLIP(k) \
1744         { \
1745                 if (clipmask&(1<<k)) \
1746                 { \
1747                         if (!(clipmask&(1<<(k^1)))) \
1748                         { \
1749                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1750                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1751                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1752                                 minproj = _mm_min_ss(minproj, proj); \
1753                                 maxproj = _mm_max_ss(maxproj, proj); \
1754                         } \
1755                         if (!(clipmask&(1<<(k^2)))) \
1756                         { \
1757                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1758                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1759                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1760                                 minproj = _mm_min_ss(minproj, proj); \
1761                                 maxproj = _mm_max_ss(maxproj, proj); \
1762                         } \
1763                         if (!(clipmask&(1<<(k^4)))) \
1764                         { \
1765                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1766                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1767                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1768                                 minproj = _mm_min_ss(minproj, proj); \
1769                                 maxproj = _mm_max_ss(maxproj, proj); \
1770                         } \
1771                 } \
1772         }
1773         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1774         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1775         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1776         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1777         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1778         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1779         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1780         *starty = _mm_cvttss_si32(maxproj);
1781         *endy = _mm_cvttss_si32(minproj)+1;
1782         return clipmask;
1783 }
1784         
1785 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1786 {
1787         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1788         float *end = out4f + numitems*4;
1789         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1790         __m128 minpos, maxpos;
1791         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1792         {
1793                 minpos = maxpos = _mm_loadu_ps(in4f);
1794                 while (out4f < end)
1795                 {
1796                         __m128 v = _mm_loadu_ps(in4f);
1797                         minpos = _mm_min_ps(minpos, v);
1798                         maxpos = _mm_max_ps(maxpos, v);
1799                         _mm_store_ps(out4f, v);
1800                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801                         _mm_store_ps(screen4f, v);
1802                         in4f += 4;
1803                         out4f += 4;
1804                         screen4f += 4;
1805                 }
1806         }
1807         else
1808         {
1809                 minpos = maxpos = _mm_load_ps(in4f);
1810                 while (out4f < end)
1811                 {
1812                         __m128 v = _mm_load_ps(in4f);
1813                         minpos = _mm_min_ps(minpos, v);
1814                         maxpos = _mm_max_ps(maxpos, v);
1815                         _mm_store_ps(out4f, v);
1816                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1817                         _mm_store_ps(screen4f, v);
1818                         in4f += 4;
1819                         out4f += 4;
1820                         screen4f += 4;
1821                 }
1822         }
1823         if (starty && endy) 
1824         {
1825                 ALIGN(float minposf[4]);
1826                 ALIGN(float maxposf[4]);
1827                 _mm_store_ps(minposf, minpos);
1828                 _mm_store_ps(maxposf, maxpos);
1829                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1830         }
1831         return 0;
1832 }
1833
1834 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1835 {
1836         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1837         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1838         float *end;
1839         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1840                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1841         end = out4f + numitems*4;
1842         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1843         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1844         m0 = _mm_loadu_ps(inmatrix16f);
1845         m1 = _mm_loadu_ps(inmatrix16f + 4);
1846         m2 = _mm_loadu_ps(inmatrix16f + 8);
1847         m3 = _mm_loadu_ps(inmatrix16f + 12);
1848         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1849         {
1850                 minpos = maxpos = _mm_loadu_ps(in4f);
1851                 while (out4f < end)
1852                 {
1853                         __m128 v = _mm_loadu_ps(in4f);
1854                         minpos = _mm_min_ps(minpos, v);
1855                         maxpos = _mm_max_ps(maxpos, v);
1856                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1857                         _mm_store_ps(out4f, v);
1858                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1859                         _mm_store_ps(screen4f, v);
1860                         in4f += 4;
1861                         out4f += 4;
1862                         screen4f += 4;
1863                 }
1864         }
1865         else
1866         {
1867                 minpos = maxpos = _mm_load_ps(in4f);
1868                 while (out4f < end)
1869                 {
1870                         __m128 v = _mm_load_ps(in4f);
1871                         minpos = _mm_min_ps(minpos, v);
1872                         maxpos = _mm_max_ps(maxpos, v);
1873                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1874                         _mm_store_ps(out4f, v);
1875                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1876                         _mm_store_ps(screen4f, v);
1877                         in4f += 4;
1878                         out4f += 4;
1879                         screen4f += 4;
1880                 }
1881         }
1882         if (starty && endy) 
1883         {
1884                 ALIGN(float minposf[4]);
1885                 ALIGN(float maxposf[4]);
1886                 _mm_store_ps(minposf, minpos);
1887                 _mm_store_ps(maxposf, maxpos);
1888                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1889         }
1890         return 0;
1891 }
1892 #endif
1893
// Fills dpsoftrast.post_array4f[outarray] with four-float data for the vertex
// range [firstvertex, firstvertex + numvertices) of input array `inarray` and
// returns that buffer:
//  - DPSOFTRAST_ARRAY_POSITION: three-float positions (fourth component 1)
//  - DPSOFTRAST_ARRAY_COLOR: float colors if set, else byte colors if set,
//    else the constant dpsoftrast.color repeated
//  - anything else: texcoord unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0) with
//    2, 3 or 4 components; with no pointer set or another component count the
//    buffer is returned without being written
// Without SSE_POSSIBLE the function returns NULL.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	int stride, unit;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
	}
	else
	{
		unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			default:
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1952
1953 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1954 {
1955         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1956         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1957         return data;
1958 }
1959
#if 0
// (compiled out) Projects output array `outarray` in place without a matrix:
// screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty/drawendy and the clip mask to dpsoftrast.drawclipped.
// With inarray >= 0 the array is first loaded via DPSOFTRAST_Array_Load.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1972
// Transforms output array `outarray` in place by inmatrix16f and projects it:
// screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty/drawendy and the clip mask to dpsoftrast.drawclipped.
// With inarray >= 0 the array is first loaded via DPSOFTRAST_Array_Load; with
// a negative inarray the current contents of the output array are used.
// Returns the array, or NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1983
1984 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1985 {
1986         int x;
1987         int startx = span->startx;
1988         int endx = span->endx;
1989         float wslope = triangle->w[0];
1990         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1991         float endz = 1.0f / (w + wslope * startx);
1992         for (x = startx;x < endx;)
1993         {
1994                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1995                 float z = endz, dz;
1996                 if (nextsub >= endx) nextsub = endsub = endx-1;
1997                 endz = 1.0f / (w + wslope * nextsub);
1998                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1999                 for (; x <= endsub; x++, z += dz)
2000                         zf[x] = z;
2001         }
2002 }
2003
2004 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2005 {
2006         int x;
2007         int startx = span->startx;
2008         int endx = span->endx;
2009         int d[4];
2010         float a, b;
2011         unsigned char * RESTRICT pixelmask = span->pixelmask;
2012         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2013         if (!pixel)
2014                 return;
2015         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2016         // handle alphatest now (this affects depth writes too)
2017         if (thread->alphatest)
2018                 for (x = startx;x < endx;x++)
2019                         if (in4f[x*4+3] < 0.5f)
2020                                 pixelmask[x] = false;
2021         // FIXME: this does not handle bigendian
2022         switch(thread->fb_blendmode)
2023         {
2024         case DPSOFTRAST_BLENDMODE_OPAQUE:
2025                 for (x = startx;x < endx;x++)
2026                 {
2027                         if (!pixelmask[x])
2028                                 continue;
2029                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2030                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2031                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2032                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2033                         pixel[x*4+0] = d[0];
2034                         pixel[x*4+1] = d[1];
2035                         pixel[x*4+2] = d[2];
2036                         pixel[x*4+3] = d[3];
2037                 }
2038                 break;
2039         case DPSOFTRAST_BLENDMODE_ALPHA:
2040                 for (x = startx;x < endx;x++)
2041                 {
2042                         if (!pixelmask[x])
2043                                 continue;
2044                         a = in4f[x*4+3] * 255.0f;
2045                         b = 1.0f - in4f[x*4+3];
2046                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2047                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2048                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2049                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2050                         pixel[x*4+0] = d[0];
2051                         pixel[x*4+1] = d[1];
2052                         pixel[x*4+2] = d[2];
2053                         pixel[x*4+3] = d[3];
2054                 }
2055                 break;
2056         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2057                 for (x = startx;x < endx;x++)
2058                 {
2059                         if (!pixelmask[x])
2060                                 continue;
2061                         a = in4f[x*4+3] * 255.0f;
2062                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2063                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2064                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2065                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2066                         pixel[x*4+0] = d[0];
2067                         pixel[x*4+1] = d[1];
2068                         pixel[x*4+2] = d[2];
2069                         pixel[x*4+3] = d[3];
2070                 }
2071                 break;
2072         case DPSOFTRAST_BLENDMODE_ADD:
2073                 for (x = startx;x < endx;x++)
2074                 {
2075                         if (!pixelmask[x])
2076                                 continue;
2077                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2078                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2079                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2080                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2081                         pixel[x*4+0] = d[0];
2082                         pixel[x*4+1] = d[1];
2083                         pixel[x*4+2] = d[2];
2084                         pixel[x*4+3] = d[3];
2085                 }
2086                 break;
2087         case DPSOFTRAST_BLENDMODE_INVMOD:
2088                 for (x = startx;x < endx;x++)
2089                 {
2090                         if (!pixelmask[x])
2091                                 continue;
2092                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2093                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2094                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2095                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2096                         pixel[x*4+0] = d[0];
2097                         pixel[x*4+1] = d[1];
2098                         pixel[x*4+2] = d[2];
2099                         pixel[x*4+3] = d[3];
2100                 }
2101                 break;
2102         case DPSOFTRAST_BLENDMODE_MUL:
2103                 for (x = startx;x < endx;x++)
2104                 {
2105                         if (!pixelmask[x])
2106                                 continue;
2107                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2108                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2109                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2110                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2111                         pixel[x*4+0] = d[0];
2112                         pixel[x*4+1] = d[1];
2113                         pixel[x*4+2] = d[2];
2114                         pixel[x*4+3] = d[3];
2115                 }
2116                 break;
2117         case DPSOFTRAST_BLENDMODE_MUL2:
2118                 for (x = startx;x < endx;x++)
2119                 {
2120                         if (!pixelmask[x])
2121                                 continue;
2122                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2123                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2124                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2125                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2126                         pixel[x*4+0] = d[0];
2127                         pixel[x*4+1] = d[1];
2128                         pixel[x*4+2] = d[2];
2129                         pixel[x*4+3] = d[3];
2130                 }
2131                 break;
2132         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2133                 for (x = startx;x < endx;x++)
2134                 {
2135                         if (!pixelmask[x])
2136                                 continue;
2137                         a = in4f[x*4+3] * -255.0f;
2138                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2139                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2140                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2141                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2142                         pixel[x*4+0] = d[0];
2143                         pixel[x*4+1] = d[1];
2144                         pixel[x*4+2] = d[2];
2145                         pixel[x*4+3] = d[3];
2146                 }
2147                 break;
2148         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2149                 for (x = startx;x < endx;x++)
2150                 {
2151                         if (!pixelmask[x])
2152                                 continue;
2153                         a = 255.0f;
2154                         b = 1.0f - in4f[x*4+3];
2155                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2156                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2157                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2158                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2159                         pixel[x*4+0] = d[0];
2160                         pixel[x*4+1] = d[1];
2161                         pixel[x*4+2] = d[2];
2162                         pixel[x*4+3] = d[3];
2163                 }
2164                 break;
2165         case DPSOFTRAST_BLENDMODE_INVADD:
2166                 for (x = startx;x < endx;x++)
2167                 {
2168                         if (!pixelmask[x])
2169                                 continue;
2170                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2171                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2172                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2173                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2174                         pixel[x*4+0] = d[0];
2175                         pixel[x*4+1] = d[1];
2176                         pixel[x*4+2] = d[2];
2177                         pixel[x*4+3] = d[3];
2178                 }
2179                 break;
2180         }
2181 }
2182
2183 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2184 {
2185 #ifdef SSE_POSSIBLE
2186         int x;
2187         int startx = span->startx;
2188         int endx = span->endx;
2189         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2190         unsigned char * RESTRICT pixelmask = span->pixelmask;
2191         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2192         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2193         if (!pixel)
2194                 return;
2195         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2196         pixeli += span->y * dpsoftrast.fb_width + span->x;
2197         // handle alphatest now (this affects depth writes too)
2198         if (thread->alphatest)
2199                 for (x = startx;x < endx;x++)
2200                         if (in4ub[x*4+3] < 0.5f)
2201                                 pixelmask[x] = false;
2202         // FIXME: this does not handle bigendian
2203         switch(thread->fb_blendmode)
2204         {
2205         case DPSOFTRAST_BLENDMODE_OPAQUE:
2206                 for (x = startx;x + 4 <= endx;)
2207                 {
2208                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2209                         {
2210                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2211                                 x += 4;
2212                         }
2213                         else
2214                         {
2215                                 if (pixelmask[x])
2216                                         pixeli[x] = ini[x];
2217                                 x++;
2218                         }
2219                 }
2220                 for (;x < endx;x++)
2221                         if (pixelmask[x])
2222                                 pixeli[x] = ini[x];
2223                 break;
2224         case DPSOFTRAST_BLENDMODE_ALPHA:
2225         #define FINISHBLEND(blend2, blend1) \
2226                 for (x = startx;x + 1 < endx;x += 2) \
2227                 { \
2228                         __m128i src, dst; \
2229                         switch (*(const unsigned short*)&pixelmask[x]) \
2230                         { \
2231                         case 0x0101: \
2232                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2233                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2234                                 blend2; \
2235                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2236                                 continue; \
2237                         case 0x0100: \
2238                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2239                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2240                                 blend1; \
2241                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2242                                 continue; \
2243                         case 0x0001: \
2244                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2245                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2246                                 blend1; \
2247                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2248                                 continue; \
2249                         } \
2250                         break; \
2251                 } \
2252                 for(;x < endx; x++) \
2253                 { \
2254                         __m128i src, dst; \
2255                         if (!pixelmask[x]) \
2256                                 continue; \
2257                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2258                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2259                         blend1; \
2260                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2261                 }
2262
2263                 FINISHBLEND({
2264                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2265                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2266                 }, {
2267                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2268                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2269                 });
2270                 break;
2271         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2272                 FINISHBLEND({
2273                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2274                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2275                 }, {
2276                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2277                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2278                 });
2279                 break;
2280         case DPSOFTRAST_BLENDMODE_ADD:
2281                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2282                 break;
2283         case DPSOFTRAST_BLENDMODE_INVMOD:
2284                 FINISHBLEND({
2285                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2286                 }, {
2287                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2288                 });
2289                 break;
2290         case DPSOFTRAST_BLENDMODE_MUL:
2291                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2292                 break;
2293         case DPSOFTRAST_BLENDMODE_MUL2:
2294                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2295                 break;
2296         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2297                 FINISHBLEND({
2298                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2299                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2300                 }, {
2301                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2302                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2303                 });
2304                 break;
2305         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2306                 FINISHBLEND({
2307                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2308                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2309                 }, {
2310                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2311                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2312                 });
2313                 break;
2314         case DPSOFTRAST_BLENDMODE_INVADD:
2315                 FINISHBLEND({
2316                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2317                 }, {
2318                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2319                 });
2320                 break;
2321         }
2322 #endif
2323 }
2324
2325 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2326 {
2327         int x;
2328         int startx = span->startx;
2329         int endx = span->endx;
2330         int flags;
2331         float c[4];
2332         float data[4];
2333         float slope[4];
2334         float tc[2], endtc[2];
2335         float tcscale[2];
2336         unsigned int tci[2];
2337         unsigned int tci1[2];
2338         unsigned int tcimin[2];
2339         unsigned int tcimax[2];
2340         int tciwrapmask[2];
2341         int tciwidth;
2342         int filter;
2343         int mip;
2344         const unsigned char * RESTRICT pixelbase;
2345         const unsigned char * RESTRICT pixel[4];
2346         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2347         // if no texture is bound, just fill it with white
2348         if (!texture)
2349         {
2350                 for (x = startx;x < endx;x++)
2351                 {
2352                         out4f[x*4+0] = 1.0f;
2353                         out4f[x*4+1] = 1.0f;
2354                         out4f[x*4+2] = 1.0f;
2355                         out4f[x*4+3] = 1.0f;
2356                 }
2357                 return;
2358         }
2359         mip = triangle->mip[texunitindex];
2360         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2361         // if this mipmap of the texture is 1 pixel, just fill it with that color
2362         if (texture->mipmap[mip][1] == 4)
2363         {
2364                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2365                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2366                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2367                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2368                 for (x = startx;x < endx;x++)
2369                 {
2370                         out4f[x*4+0] = c[0];
2371                         out4f[x*4+1] = c[1];
2372                         out4f[x*4+2] = c[2];
2373                         out4f[x*4+3] = c[3];
2374                 }
2375                 return;
2376         }
2377         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2378         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2379         flags = texture->flags;
2380         tcscale[0] = texture->mipmap[mip][2];
2381         tcscale[1] = texture->mipmap[mip][3];
2382         tciwidth = texture->mipmap[mip][2];
2383         tcimin[0] = 0;
2384         tcimin[1] = 0;
2385         tcimax[0] = texture->mipmap[mip][2]-1;
2386         tcimax[1] = texture->mipmap[mip][3]-1;
2387         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2388         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2389         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2390         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2391         for (x = startx;x < endx;)
2392         {
2393                 unsigned int subtc[2];
2394                 unsigned int substep[2];
2395                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2396                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2397                 if (nextsub >= endx)
2398                 {
2399                         nextsub = endsub = endx-1;      
2400                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2401                 }
2402                 tc[0] = endtc[0];
2403                 tc[1] = endtc[1];
2404                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2405                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2406                 substep[0] = (endtc[0] - tc[0]) * subscale;
2407                 substep[1] = (endtc[1] - tc[1]) * subscale;
2408                 subtc[0] = tc[0] * (1<<16);
2409                 subtc[1] = tc[1] * (1<<16);
2410                 if (filter)
2411                 {
2412                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2413                         {
2414                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2415                                 {
2416                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2417                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2418                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2419                                         tci[0] = subtc[0]>>16;
2420                                         tci[1] = subtc[1]>>16;
2421                                         tci1[0] = tci[0] + 1;
2422                                         tci1[1] = tci[1] + 1;
2423                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2424                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2425                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2426                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2427                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2428                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2429                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2430                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2431                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2432                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2433                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2434                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2435                                         out4f[x*4+0] = c[0];
2436                                         out4f[x*4+1] = c[1];
2437                                         out4f[x*4+2] = c[2];
2438                                         out4f[x*4+3] = c[3];
2439                                 }
2440                         }
2441                         else
2442                         {
2443                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2444                                 {
2445                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2446                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2447                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2448                                         tci[0] = subtc[0]>>16;
2449                                         tci[1] = subtc[1]>>16;
2450                                         tci1[0] = tci[0] + 1;
2451                                         tci1[1] = tci[1] + 1;
2452                                         tci[0] &= tciwrapmask[0];
2453                                         tci[1] &= tciwrapmask[1];
2454                                         tci1[0] &= tciwrapmask[0];
2455                                         tci1[1] &= tciwrapmask[1];
2456                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2457                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2458                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2459                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2460                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2461                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2462                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2463                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2464                                         out4f[x*4+0] = c[0];
2465                                         out4f[x*4+1] = c[1];
2466                                         out4f[x*4+2] = c[2];
2467                                         out4f[x*4+3] = c[3];
2468                                 }
2469                         }
2470                 }
2471                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2472                 {
2473                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2474                         {
2475                                 tci[0] = subtc[0]>>16;
2476                                 tci[1] = subtc[1]>>16;
2477                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2478                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2479                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2480                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2481                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2482                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2483                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2484                                 out4f[x*4+0] = c[0];
2485                                 out4f[x*4+1] = c[1];
2486                                 out4f[x*4+2] = c[2];
2487                                 out4f[x*4+3] = c[3];
2488                         }
2489                 }
2490                 else
2491                 {
2492                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2493                         {
2494                                 tci[0] = subtc[0]>>16;
2495                                 tci[1] = subtc[1]>>16;
2496                                 tci[0] &= tciwrapmask[0];
2497                                 tci[1] &= tciwrapmask[1];
2498                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2499                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2500                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2501                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2502                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2503                                 out4f[x*4+0] = c[0];
2504                                 out4f[x*4+1] = c[1];
2505                                 out4f[x*4+2] = c[2];
2506                                 out4f[x*4+3] = c[3];
2507                         }
2508                 }
2509         }
2510 }
2511
2512 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2513 {
2514 #ifdef SSE_POSSIBLE
2515         int x;
2516         int startx = span->startx;
2517         int endx = span->endx;
2518         int flags;
2519         __m128 data, slope, tcscale;
2520         __m128i tcsize, tcmask, tcoffset, tcmax;
2521         __m128 tc, endtc;
2522         __m128i subtc, substep, endsubtc;
2523         int filter;
2524         int mip;
2525         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2526         const unsigned char * RESTRICT pixelbase;
2527         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2528         // if no texture is bound, just fill it with white
2529         if (!texture)
2530         {
2531                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2532                 return;
2533         }
2534         mip = triangle->mip[texunitindex];
2535         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2536         // if this mipmap of the texture is 1 pixel, just fill it with that color
2537         if (texture->mipmap[mip][1] == 4)
2538         {
2539                 unsigned int k = *((const unsigned int *)pixelbase);
2540                 for (x = startx;x < endx;x++)
2541                         outi[x] = k;
2542                 return;
2543         }
2544         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2545         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2546         flags = texture->flags;
2547         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2548         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2549         tcscale = _mm_cvtepi32_ps(tcsize);
2550         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2551         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2552         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2553         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2554         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2555         tcmax = _mm_packs_epi32(tcmask, tcmask);
2556         for (x = startx;x < endx;)
2557         {
2558                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2559                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2560                 if (nextsub >= endx)
2561                 {
2562                         nextsub = endsub = endx-1;
2563                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2564                 }       
2565                 tc = endtc;
2566                 subtc = endsubtc;
2567                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2568                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2569                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2570                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2571                 substep = _mm_slli_epi32(substep, 1);
2572                 if (filter)
2573                 {
2574                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2575                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2576                         {
2577                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2578                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2579                                 {
2580                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2581                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2582                                         tci = _mm_madd_epi16(tci, tcoffset);
2583                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2584                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2585                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2586                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2587                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2588                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2589                                         fracm = _mm_srli_epi16(subtc, 1);
2590                                         pix1 = _mm_add_epi16(pix1,
2591                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2592                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2593                                         pix3 = _mm_add_epi16(pix3,
2594                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2595                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2596                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2597                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2598                                         pix2 = _mm_add_epi16(pix2,
2599                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2600                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2601                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2602                                 }
2603                                 if (x <= endsub)
2604                                 {
2605                                         const unsigned char * RESTRICT ptr1;
2606                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2607                                         tci = _mm_madd_epi16(tci, tcoffset);
2608                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2609                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2610                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2611                                         fracm = _mm_srli_epi16(subtc, 1);
2612                                         pix1 = _mm_add_epi16(pix1,
2613                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2614                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2615                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2616                                         pix1 = _mm_add_epi16(pix1,
2617                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2618                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2619                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2620                                         x++;
2621                                 }
2622                         }
2623                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2624                         {
2625                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2626                                 {
2627                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2628                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2629                                         tci = _mm_madd_epi16(tci, tcoffset);
2630                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2631                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2632                                                                                         _mm_setzero_si128());
2633                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2634                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2635                                                                                         _mm_setzero_si128());
2636                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2637                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2638                                         tci = _mm_madd_epi16(tci, tcoffset);
2639                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2640                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2641                                                                                         _mm_setzero_si128());
2642                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2643                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2644                                                                                         _mm_setzero_si128());
2645                                         fracm = _mm_srli_epi16(subtc, 1);
2646                                         pix1 = _mm_add_epi16(pix1,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2649                                         pix3 = _mm_add_epi16(pix3,
2650                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2651                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2652                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2653                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2654                                         pix2 = _mm_add_epi16(pix2,
2655                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2656                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2657                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2658                                 }
2659                                 if (x <= endsub)
2660                                 {
2661                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2662                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2663                                         tci = _mm_madd_epi16(tci, tcoffset);
2664                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2665                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2666                                                                                         _mm_setzero_si128());
2667                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2668                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2669                                                                                         _mm_setzero_si128());
2670                                         fracm = _mm_srli_epi16(subtc, 1);
2671                                         pix1 = _mm_add_epi16(pix1,
2672                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2673                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2674                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2675                                         pix1 = _mm_add_epi16(pix1,
2676                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2677                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2678                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2679                                         x++;
2680                                 }
2681                         }
2682                         else
2683                         {
2684                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2685                                 {
2686                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2687                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2688                                         tci = _mm_madd_epi16(tci, tcoffset);
2689                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2690                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2691                                                                                         _mm_setzero_si128());
2692                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2693                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2694                                                                                         _mm_setzero_si128());
2695                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2696                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2697                                         tci = _mm_madd_epi16(tci, tcoffset);
2698                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2699                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2700                                                                                         _mm_setzero_si128());
2701                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2702                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2703                                                                                         _mm_setzero_si128());
2704                                         fracm = _mm_srli_epi16(subtc, 1);
2705                                         pix1 = _mm_add_epi16(pix1,
2706                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2707                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2708                                         pix3 = _mm_add_epi16(pix3,
2709                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2710                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2711                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2712                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2713                                         pix2 = _mm_add_epi16(pix2,
2714                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2715                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2716                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2717                                 }
2718                                 if (x <= endsub)
2719                                 {
2720                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2721                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2722                                         tci = _mm_madd_epi16(tci, tcoffset);
2723                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2724                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2725                                                                                         _mm_setzero_si128());
2726                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2727                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2728                                                                                         _mm_setzero_si128());
2729                                         fracm = _mm_srli_epi16(subtc, 1);
2730                                         pix1 = _mm_add_epi16(pix1,
2731                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2732                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2733                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2734                                         pix1 = _mm_add_epi16(pix1,
2735                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2736                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2737                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2738                                         x++;
2739                                 }
2740                         }
2741                 }
2742                 else
2743                 {
2744                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2745                         {
2746                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2747                                 {
2748                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2749                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2750                                         tci = _mm_madd_epi16(tci, tcoffset);
2751                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2752                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2753                                 }
2754                                 if (x <= endsub)
2755                                 {
2756                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2757                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2758                                         tci = _mm_madd_epi16(tci, tcoffset);
2759                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2760                                         x++;
2761                                 }
2762                         }
2763                         else
2764                         {
2765                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2766                                 {
2767                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2768                                         tci = _mm_and_si128(tci, tcmax); 
2769                                         tci = _mm_madd_epi16(tci, tcoffset);
2770                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2771                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2772                                 }
2773                                 if (x <= endsub)
2774                                 {
2775                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2776                                         tci = _mm_and_si128(tci, tcmax); 
2777                                         tci = _mm_madd_epi16(tci, tcoffset);
2778                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2779                                         x++;
2780                                 }
2781                         }
2782                 }
2783         }
2784 #endif
2785 }
2786
2787 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2788 {
2789         // TODO: IMPLEMENT
2790         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2791 }
2792
// Shadowmap lookup stub: sampling is not implemented, so the constant 1.0f
// is returned for any input (the vector argument is never read).
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2798
2799 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2800 {
2801         int x;
2802         int startx = span->startx;
2803         int endx = span->endx;
2804         float c[4];
2805         float data[4];
2806         float slope[4];
2807         float z;
2808         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2809         for (x = startx;x < endx;x++)
2810         {
2811                 z = zf[x];
2812                 c[0] = (data[0] + slope[0]*x) * z;
2813                 c[1] = (data[1] + slope[1]*x) * z;
2814                 c[2] = (data[2] + slope[2]*x) * z;
2815                 c[3] = (data[3] + slope[3]*x) * z;
2816                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2817                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2818                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2819                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2820         }
2821 }
2822
2823 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2824 {
2825         int x;
2826         int startx = span->startx;
2827         int endx = span->endx;
2828         float c[4];
2829         float data[4];
2830         float slope[4];
2831         float z;
2832         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2833         for (x = startx;x < endx;x++)
2834         {
2835                 z = zf[x];
2836                 c[0] = (data[0] + slope[0]*x) * z;
2837                 c[1] = (data[1] + slope[1]*x) * z;
2838                 c[2] = (data[2] + slope[2]*x) * z;
2839                 c[3] = (data[3] + slope[3]*x) * z;
2840                 out4f[x*4+0] = c[0];
2841                 out4f[x*4+1] = c[1];
2842                 out4f[x*4+2] = c[2];
2843                 out4f[x*4+3] = c[3];
2844         }
2845 }
2846
2847 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2848 {
2849         int x, startx = span->startx, endx = span->endx;
2850         float c[4], localcolor[4];
2851         localcolor[0] = subcolor[0];
2852         localcolor[1] = subcolor[1];
2853         localcolor[2] = subcolor[2];
2854         localcolor[3] = subcolor[3];
2855         for (x = startx;x < endx;x++)
2856         {
2857                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2858                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2859                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2860                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2861                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2862                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2863                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2864                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2865         }
2866 }
2867
2868 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2869 {
2870         int x, startx = span->startx, endx = span->endx;
2871         for (x = startx;x < endx;x++)
2872         {
2873                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2874                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2875                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2876                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2877         }
2878 }
2879
2880 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2881 {
2882         int x, startx = span->startx, endx = span->endx;
2883         for (x = startx;x < endx;x++)
2884         {
2885                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2886                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2887                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2888                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2889         }
2890 }
2891
2892 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2893 {
2894         int x, startx = span->startx, endx = span->endx;
2895         float a, b;
2896         for (x = startx;x < endx;x++)
2897         {
2898                 a = 1.0f - inb4f[x*4+3];
2899                 b = inb4f[x*4+3];
2900                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2901                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2902                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2903                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2904         }
2905 }
2906
2907 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2908 {
2909         int x, startx = span->startx, endx = span->endx;
2910         float localcolor[4], ilerp, lerp;
2911         localcolor[0] = color[0];
2912         localcolor[1] = color[1];
2913         localcolor[2] = color[2];
2914         localcolor[3] = color[3];
2915         ilerp = 1.0f - localcolor[3];
2916         lerp = localcolor[3];
2917         for (x = startx;x < endx;x++)
2918         {
2919                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2920                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2921                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2922                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2923         }
2924 }
2925
2926
2927
2928 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2929 {
2930 #ifdef SSE_POSSIBLE
2931         int x;
2932         int startx = span->startx;
2933         int endx = span->endx;
2934         __m128 data, slope;
2935         __m128 mod, endmod;
2936         __m128i submod, substep, endsubmod;
2937         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2938         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2939         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2940         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2941         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2942         for (x = startx; x < endx;)
2943         {
2944                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2945                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2946                 if (nextsub >= endx)
2947                 {
2948                         nextsub = endsub = endx-1;
2949                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2950                 }
2951                 mod = endmod;
2952                 submod = endsubmod;
2953                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2954                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2955                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2956                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2957                 substep = _mm_packs_epi32(substep, substep);
2958                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2959                 {
2960                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2961                         pix = _mm_mulhi_epu16(pix, submod);
2962                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2963                 }
2964                 if (x <= endsub)
2965                 {
2966                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2967                         pix = _mm_mulhi_epu16(pix, submod);
2968                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2969                         x++;
2970                 }
2971         }
2972 #endif
2973 }
2974
2975 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangl