71bda02c2742518378c682aea5d6cb3c0ebb5c0e
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// byte alignment requested by ALIGN() and ATOMIC() respectively
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// Compiler-specific alignment, memory barrier and atomic counter primitives.
// They are only selected when SSE2 support is compiled in; whatever is left
// undefined afterwards receives a plain, non-atomic fallback further down.
#if defined(SSE2_PRESENT)
	#if defined(__APPLE__)
		// OS X: OSAtomic barrier variants
		#include <libkern/OSAtomic.h>
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int32_t
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__)
		// GCC-compatible compilers: __sync builtins
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
		// alternative full barrier: (__sync_synchronize())
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		// MSVC: Interlocked* functions
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(32)) var
		// alternative full barrier: (MemoryBarrier())
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// Fallbacks: no alignment, no barrier, ordinary (non-atomic) integer counters.
#if !defined(ALIGN)
#define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
68
#ifdef SSE2_PRESENT
#include <emmintrin.h>

// aligned allocation (ATOMIC_SIZE byte alignment) used when SSE2 is compiled in
#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned counterpart of calloc(): allocates nmemb*size zero-filled bytes.
// Returns NULL when the allocation fails or when nmemb*size does not fit in
// a size_t. Memory must be released with MM_FREE.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc() refuses overflowing requests; do the same instead of silently
	// allocating (and zeroing) a truncated block
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
// without SSE2 the plain C library allocator is used
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
87
// Indices of the per-vertex attribute arrays (see the attribs[] and
// post_array4f[] members, which are sized by DPSOFTRAST_ARRAY_TOTAL).
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10 // number of arrays, not an array itself
}
DPSOFTRAST_ARRAY;
103
104 typedef struct DPSOFTRAST_Texture_s
105 {
106         int flags;
107         int width;
108         int height;
109         int depth;
110         int sides;
111         DPSOFTRAST_TEXTURE_FILTER filter;
112         int mipmaps;
113         int size;
114         ATOMIC_COUNTER binds;
115         unsigned char *bytes;
116         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
117 }
118 DPSOFTRAST_Texture;
119
// command structures use the same alignment as SIMD data
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Header that every command structure starts with.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;       // one of the DPSOFTRAST_OPCODE_* values
	unsigned short commandsize; // NOTE(review): presumably the size of the whole command in bytes - confirm
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the opcode constant
// DPSOFTRAST_OPCODE_<name> and the aligned structure DPSOFTRAST_Command_<name>,
// which begins with the common header members followed by "fields".
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
140
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
143
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
145 {
146         int freecommand;
147         int usedcommands;
148         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
149 }
150 DPSOFTRAST_State_Command_Pool);
151
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
153 {
154         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
155         float w[3];
156         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
157 }
158 DPSOFTRAST_State_Triangle);
159
// Evaluates attribute array "arrayindex" of "triangle" at the start of "span":
//   slope = attribs[arrayindex][0]                       (change per pixel in x)
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// SSE version: "data" and "slope" are __m128 lvalues. The macro expands to a
// braced block and evaluates its pointer arguments more than once.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of the above: "data" and "slope" are float[4] arrays.
// All arguments are parenthesized so that pointer expressions such as
// "spans + i" or "&span" expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
176                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels belonging to a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer x coord
	int x;
	// framebuffer y coord
	int y;
	// usable range (according to pixelmask)
	int startx;
	int endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
}
DPSOFTRAST_State_Span);
189
// capacity of the per-thread span and triangle arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of the "validate" member: each one marks a group of derived values
// that DPSOFTRAST_Validate has to recompute
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
// everything that must be valid before drawing
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
197
// Blend modes the rasterizer implements; DPSOFTRAST_RecalcBlendFunc maps the
// GL blend factors onto one of these.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10 // number of modes
}
DPSOFTRAST_BLENDMODE;
213
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
215 {
216         void *thread;
217         int index;
218         
219         int cullface;
220         int colormask[4];
221         int blendfunc[2];
222         int blendsubtract;
223         int depthmask;
224         int depthtest;
225         int depthfunc;
226         int scissortest;
227         int alphatest;
228         int alphafunc;
229         float alphavalue;
230         int viewport[4];
231         int scissor[4];
232         float depthrange[2];
233         float polygonoffset[2];
234
235         int shader_mode;
236         int shader_permutation;
237         int shader_exactspecularmath;
238
239         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
240         
241         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
242         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
243
244         // DPSOFTRAST_VALIDATE_ flags
245         int validate;
246
247         // derived values (DPSOFTRAST_VALIDATE_FB)
248         int fb_colormask;
249         int fb_scissor[4];
250         ALIGN(float fb_viewportcenter[4]);
251         ALIGN(float fb_viewportscale[4]);
252
253         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
254         int fb_depthfunc;
255
256         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
257         int fb_blendmode;
258
259         // band boundaries
260         int miny1;
261         int maxy1;
262         int miny2;
263         int maxy2;
264
265         ATOMIC(volatile int commandoffset);
266
267         volatile bool waiting;
268         volatile bool starving;
269         void *waitcond;
270         void *drawcond;
271         void *drawmutex;
272
273         int numspans;
274         int numtriangles;
275         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
276         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
277 }
278 DPSOFTRAST_State_Thread);
279
280 typedef ATOMIC(struct DPSOFTRAST_State_s
281 {
282         int fb_width;
283         int fb_height;
284         unsigned int *fb_depthpixels;
285         unsigned int *fb_colorpixels[4];
286
287         int viewport[4];
288         ALIGN(float fb_viewportcenter[4]);
289         ALIGN(float fb_viewportscale[4]);
290
291         float color[4];
292         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
293         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
294
295         const float *pointer_vertex3f;
296         const float *pointer_color4f;
297         const unsigned char *pointer_color4ub;
298         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
299         int stride_vertex;
300         int stride_color;
301         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
303         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
304
305         int firstvertex;
306         int numvertices;
307         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
308         float *screencoord4f;
309         int drawstarty;
310         int drawendy;
311         int drawclipped;
312         
313         int shader_mode;
314         int shader_permutation;
315         int shader_exactspecularmath;
316
317         int texture_max;
318         int texture_end;
319         int texture_firstfree;
320         DPSOFTRAST_Texture *texture;
321
322         int bigendian;
323
324         // error reporting
325         const char *errorstring;
326
327         bool usethreads;
328         int interlace;
329         int numthreads;
330         DPSOFTRAST_State_Thread *threads;
331
332         ATOMIC(volatile int drawcommand);
333
334         DPSOFTRAST_State_Command_Pool commandpool;
335 }
336 DPSOFTRAST_State);
337
338 DPSOFTRAST_State dpsoftrast;
339
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (scaled by 255 and rounded) into one int laid out
// as 0xAARRGGBB. Arguments are parenthesized so expressions can be passed, and
// the shifts are done on unsigned values so that an alpha of 255 does not
// overflow a signed int; the result type is still int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a floating point depth d to an integer depth value; d = 1 maps to 0
// and d = 0 maps to DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
345
346 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
347 {
348         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
349         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
350         fb_viewportcenter[3] = 0.5f;
351         fb_viewportcenter[0] = 0.0f;
352         fb_viewportscale[1] = 0.5f * viewport[2];
353         fb_viewportscale[2] = -0.5f * viewport[3];
354         fb_viewportscale[3] = 0.5f;
355         fb_viewportscale[0] = 1.0f;
356 }
357
358 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
359 {
360         if (dpsoftrast.interlace)
361         {
362                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
363                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
364                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
365                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
366         }
367         else
368         {
369                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
370                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
371         }
372 }
373
374 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
375 {
376         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
377         // and viewport projection values
378         int x1, x2;
379         int y1, y2;
380         x1 = thread->scissor[0];
381         x2 = thread->scissor[0] + thread->scissor[2];
382         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
383         y2 = dpsoftrast.fb_height - thread->scissor[1];
384         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
385         if (x1 < 0) x1 = 0;
386         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
387         if (y1 < 0) y1 = 0;
388         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
389         thread->fb_scissor[0] = x1;
390         thread->fb_scissor[1] = y1;
391         thread->fb_scissor[2] = x2 - x1;
392         thread->fb_scissor[3] = y2 - y1;
393
394         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
395         DPSOFTRAST_RecalcThread(thread);
396 }
397
398 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
399 {
400         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
401 }
402
403 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
404 {
405         if (thread->blendsubtract)
406         {
407                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
408                 {
409                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
410                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
411                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
412                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
413                 }
414         }
415         else
416         {       
417                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
418                 {
419                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
420                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
421                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
422                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
423                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
424                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
425                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
426                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
427                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
428                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
429                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
430                 }
431         }
432 }
433
434 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
435
436 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
437 {
438         mask &= thread->validate;
439         if (!mask)
440                 return;
441         if (mask & DPSOFTRAST_VALIDATE_FB)
442         {
443                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
444                 DPSOFTRAST_RecalcFB(thread);
445         }
446         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
447         {
448                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
449                 DPSOFTRAST_RecalcDepthFunc(thread);
450         }
451         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
452         {
453                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
454                 DPSOFTRAST_RecalcBlendFunc(thread);
455         }
456 }
457
458 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
459 {
460         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
461                 return &dpsoftrast.texture[index];
462         return NULL;
463 }
464
465 static void DPSOFTRAST_Texture_Grow(void)
466 {
467         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
468         DPSOFTRAST_State_Thread *thread;
469         int i;
470         int j;
471         DPSOFTRAST_Flush();
472         // expand texture array as needed
473         if (dpsoftrast.texture_max < 1024)
474                 dpsoftrast.texture_max = 1024;
475         else
476                 dpsoftrast.texture_max *= 2;
477         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
478         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
479                 if (dpsoftrast.texbound[i])
480                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
481         for (j = 0; j < dpsoftrast.numthreads; j++)
482         {
483                 thread = &dpsoftrast.threads[j];
484                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
485                         if (thread->texbound[i])
486                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
487         }
488 }
489
490 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
491 {
492         int w;
493         int h;
494         int d;
495         int size;
496         int s;
497         int texnum;
498         int mipmaps;
499         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
500         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
501         DPSOFTRAST_Texture *texture;
502         if (width*height*depth < 1)
503         {
504                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
505                 return 0;
506         }
507         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
508         {
509                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
510                 return 0;
511         }
512         switch(texformat)
513         {
514         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
515         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
516         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
517                 break;
518         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
519                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
520                 {
521                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
522                         return 0;
523                 }
524                 if (depth != 1)
525                 {
526                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
527                         return 0;
528                 }
529                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
530                 {
531                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
532                         return 0;
533                 }
534                 break;
535         }
536         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
537         {
538                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
539                 return 0;
540         }
541         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
542         {
543                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
544                 return 0;
545         }
546         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
547         {
548                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
549                 return 0;
550         }
551         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
552         {
553                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
554                 return 0;
555         }
556         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
557         {
558                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
559                 return 0;
560         }
561         // find first empty slot in texture array
562         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
563                 if (!dpsoftrast.texture[texnum].bytes)
564                         break;
565         dpsoftrast.texture_firstfree = texnum + 1;
566         if (dpsoftrast.texture_max <= texnum)
567                 DPSOFTRAST_Texture_Grow();
568         if (dpsoftrast.texture_end <= texnum)
569                 dpsoftrast.texture_end = texnum + 1;
570         texture = &dpsoftrast.texture[texnum];
571         memset(texture, 0, sizeof(*texture));
572         texture->flags = flags;
573         texture->width = width;
574         texture->height = height;
575         texture->depth = depth;
576         texture->sides = sides;
577         texture->binds = 0;
578         w = width;
579         h = height;
580         d = depth;
581         size = 0;
582         mipmaps = 0;
583         w = width;
584         h = height;
585         d = depth;
586         for (;;)
587         {
588                 s = w * h * d * sides * 4;
589                 texture->mipmap[mipmaps][0] = size;
590                 texture->mipmap[mipmaps][1] = s;
591                 texture->mipmap[mipmaps][2] = w;
592                 texture->mipmap[mipmaps][3] = h;
593                 texture->mipmap[mipmaps][4] = d;
594                 size += s;
595                 mipmaps++;
596                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
597                         break;
598                 if (w > 1) w >>= 1;
599                 if (h > 1) h >>= 1;
600                 if (d > 1) d >>= 1;
601         }
602         texture->mipmaps = mipmaps;
603         texture->size = size;
604
605         // allocate the pixels now
606         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
607
608         return texnum;
609 }
610 void DPSOFTRAST_Texture_Free(int index)
611 {
612         DPSOFTRAST_Texture *texture;
613         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
614         if (texture->binds)
615                 DPSOFTRAST_Flush();
616         if (texture->bytes)
617                 MM_FREE(texture->bytes);
618         texture->bytes = NULL;
619         memset(texture, 0, sizeof(*texture));
620         // adjust the free range and used range
621         if (dpsoftrast.texture_firstfree > index)
622                 dpsoftrast.texture_firstfree = index;
623         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
624                 dpsoftrast.texture_end--;
625 }
626 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
627 {
628         int i, x, y, z, w, layer0, layer1, row0, row1;
629         unsigned char *o, *i0, *i1, *i2, *i3;
630         DPSOFTRAST_Texture *texture;
631         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
632         if (texture->mipmaps <= 1)
633                 return;
634         for (i = 1;i < texture->mipmaps;i++)
635         {
636                 for (z = 0;z < texture->mipmap[i][4];z++)
637                 {
638                         layer0 = z*2;
639                         layer1 = z*2+1;
640                         if (layer1 >= texture->mipmap[i-1][4])
641                                 layer1 = texture->mipmap[i-1][4]-1;
642                         for (y = 0;y < texture->mipmap[i][3];y++)
643                         {
644                                 row0 = y*2;
645                                 row1 = y*2+1;
646                                 if (row1 >= texture->mipmap[i-1][3])
647                                         row1 = texture->mipmap[i-1][3]-1;
648                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
649                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
650                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
651                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
652                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
653                                 w = texture->mipmap[i][2];
654                                 if (layer1 > layer0)
655                                 {
656                                         if (texture->mipmap[i-1][2] > 1)
657                                         {
658                                                 // average 3D texture
659                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
660                                                 {
661                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
662                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
663                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
664                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
665                                                 }
666                                         }
667                                         else
668                                         {
669                                                 // average 3D mipmap with parent width == 1
670                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
671                                                 {
672                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
673                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
674                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
675                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
676                                                 }
677                                         }
678                                 }
679                                 else
680                                 {
681                                         if (texture->mipmap[i-1][2] > 1)
682                                         {
683                                                 // average 2D texture (common case)
684                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
685                                                 {
686                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
687                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
688                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
689                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
690                                                 }
691                                         }
692                                         else
693                                         {
694                                                 // 2D texture with parent width == 1
695                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
696                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
697                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
698                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
699                                         }
700                                 }
701                         }
702                 }
703         }
704 }
705 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
706 {
707         DPSOFTRAST_Texture *texture;
708         unsigned char *dst;
709         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
710         if (texture->binds)
711                 DPSOFTRAST_Flush();
712         if (pixels)
713         {
714                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
715                 while (blockheight > 0)
716                 {
717                         memcpy(dst, pixels, blockwidth * 4);
718                         pixels += blockwidth * 4;
719                         dst += texture->mipmap[0][2] * 4;
720                         blockheight--;
721                 }
722         }
723         DPSOFTRAST_Texture_CalculateMipmaps(index);
724 }
725 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
726 {
727         DPSOFTRAST_Texture *texture;
728         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
729         if (texture->binds)
730                 DPSOFTRAST_Flush();
731         if (pixels)
732                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
733         DPSOFTRAST_Texture_CalculateMipmaps(index);
734 }
735 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
736 {
737         DPSOFTRAST_Texture *texture;
738         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
739         return texture->mipmap[mip][2];
740 }
741 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
742 {
743         DPSOFTRAST_Texture *texture;
744         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
745         return texture->mipmap[mip][3];
746 }
747 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
748 {
749         DPSOFTRAST_Texture *texture;
750         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
751         return texture->mipmap[mip][4];
752 }
753 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
754 {
755         DPSOFTRAST_Texture *texture;
756         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
757         if (texture->binds)
758                 DPSOFTRAST_Flush();
759         return texture->bytes + texture->mipmap[mip][0];
760 }
761 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
762 {
763         DPSOFTRAST_Texture *texture;
764         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
765         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
766         {
767                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
768                 return;
769         }
770         if (texture->binds)
771                 DPSOFTRAST_Flush();
772         texture->filter = filter;
773 }
774
775 static void DPSOFTRAST_Draw_FlushThreads(void);
776
777 static void DPSOFTRAST_Draw_SyncCommands(void)
778 {
779         if(dpsoftrast.usethreads) MEMORY_BARRIER;
780         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
781 }
782
// Waits until the threads have consumed enough of the command pool that at most
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL - space units of it are still in use, then
// stores the recomputed usage in commandpool.usedcommands.
// Returns immediately when the cached usedcommands value already leaves room.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
        DPSOFTRAST_State_Thread *thread;
        int i;
        int freecommand = dpsoftrast.commandpool.freecommand;
        int usedcommands = dpsoftrast.commandpool.usedcommands;
        if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
                return;
        // make everything allocated so far visible through dpsoftrast.drawcommand
        DPSOFTRAST_Draw_SyncCommands();
        for(;;)
        {
                int waitindex = -1;
                int commandoffset;
                usedcommands = 0;
                // find the thread that is furthest behind freecommand; its distance
                // (taken modulo the pool size) is the amount of pool still in use
                for (i = 0; i < dpsoftrast.numthreads; i++)
                {
                        thread = &dpsoftrast.threads[i]; 
                        commandoffset = freecommand - thread->commandoffset;
                        if (commandoffset < 0)
                                commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
                        if (commandoffset > usedcommands)
                        {
                                waitindex = i;
                                usedcommands = commandoffset;
                        }
                }
                // done once enough room is free or no thread is behind at all
                if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
                        break;
                thread = &dpsoftrast.threads[waitindex];
                Thread_LockMutex(thread->drawmutex);
                // only wait if that thread has not yet reached drawcommand; the usage
                // is recomputed from scratch on the next pass of the outer loop
                if (thread->commandoffset != dpsoftrast.drawcommand)
                {
                        thread->waiting = true;
                        // signal drawcond first if the thread has flagged itself as starving
                        if (thread->starving) Thread_CondSignal(thread->drawcond);
                        Thread_CondWait(thread->waitcond, thread->drawmutex);
                        thread->waiting = false;
                }
                Thread_UnlockMutex(thread->drawmutex);
        }
        dpsoftrast.commandpool.usedcommands = usedcommands;
}
824
// pads size up to a multiple of COMMAND_SIZE (the masking assumes COMMAND_SIZE is a power of two)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
        ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// allocates a DPSOFTRAST_Command_<name> through DPSOFTRAST_AllocateCommand with opcode
// DPSOFTRAST_OPCODE_<name> and the padded struct size, and yields it as a typed pointer
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
        ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
829
// Reserves size units at the write position of the command pool, fills in the
// command header (opcode and commandsize) and returns a pointer to the command.
// The pool is used as a ring: a command that would not fit before the end of
// the pool is placed at offset 0 instead, and the skipped tail counts as used.
// When the pool is too full, room is made first (see below).
static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
{
        DPSOFTRAST_Command *command;
        int freecommand = dpsoftrast.commandpool.freecommand;
        int usedcommands = dpsoftrast.commandpool.usedcommands;
        // room demanded beyond the command itself: one command header, plus the
        // unusable tail of the pool when the command has to wrap around
        int extra = sizeof(DPSOFTRAST_Command);
        if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
                extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
        if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
        {
                // threaded: wait for the threads to consume commands; otherwise flush
                if (dpsoftrast.usethreads)
                        DPSOFTRAST_Draw_FreeCommandPool(size + extra);
                else
                        DPSOFTRAST_Draw_FlushThreads();
                // both calls may have changed the pool counters, so reload them
                freecommand = dpsoftrast.commandpool.freecommand;
                usedcommands = dpsoftrast.commandpool.usedcommands;
        }
        if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
        {
                // not enough room before the end of the pool: mark the tail with a
                // Reset opcode (presumably sends the reader back to offset 0 - confirm
                // against the interpreter) and restart at the beginning
                command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
                command->opcode = DPSOFTRAST_OPCODE_Reset;
                usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
                freecommand = 0;
        }
        command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
        command->opcode = opcode;
        command->commandsize = size;
        freecommand += size;
        // a command ending exactly at the end of the pool wraps the write position
        if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
                freecommand = 0;
        dpsoftrast.commandpool.freecommand = freecommand;
        dpsoftrast.commandpool.usedcommands = usedcommands + size;
        return command;
}
864
865 static void DPSOFTRAST_UndoCommand(int size)
866 {
867         int freecommand = dpsoftrast.commandpool.freecommand;
868         int usedcommands = dpsoftrast.commandpool.usedcommands;
869         freecommand -= size;
870         if (freecommand < 0)
871                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
872         usedcommands -= size;
873         dpsoftrast.commandpool.freecommand = freecommand;
874         dpsoftrast.commandpool.usedcommands = usedcommands;
875 }
876                 
877 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
878 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
879 {
880         thread->viewport[0] = command->x;
881         thread->viewport[1] = command->y;
882         thread->viewport[2] = command->width;
883         thread->viewport[3] = command->height;
884         thread->validate |= DPSOFTRAST_VALIDATE_FB;
885 }
886 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
887 {
888         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
889         command->x = x;
890         command->y = y;
891         command->width = width;
892         command->height = height;
893
894         dpsoftrast.viewport[0] = x;
895         dpsoftrast.viewport[1] = y;
896         dpsoftrast.viewport[2] = width;
897         dpsoftrast.viewport[3] = height;
898         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
899 }
900
// ClearColor command: carries the clear color as four floats
DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
// Fills this thread's share of the scissor rectangle with the command's color
// in every color buffer that is present (non-NULL entries of fb_colorpixels[0..3]).
static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
{
        int i, x1, y1, x2, y2, w, h, x, y;
        int miny1, maxy1, miny2, maxy2;
        int bandy;
        unsigned int *p;
        unsigned int c;
        // bring the thread's framebuffer state up to date before reading fb_scissor and the y limits
        DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
        // the thread works on two row ranges, [miny1, maxy1) and [miny2, maxy2)
        miny1 = thread->miny1;
        maxy1 = thread->maxy1;
        miny2 = thread->miny2;
        maxy2 = thread->maxy2;
        // clear rectangle = scissor rectangle (x, y, width, height) ...
        x1 = thread->fb_scissor[0];
        y1 = thread->fb_scissor[1];
        x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
        y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
        // ... with its rows limited to the span from miny1 to maxy2
        if (y1 < miny1) y1 = miny1;
        if (y2 > maxy2) y2 = maxy2;
        w = x2 - x1;
        h = y2 - y1;
        if (w < 1 || h < 1)
                return;
        // FIXME: honor fb_colormask?
        c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
        for (i = 0;i < 4;i++)
        {
                if (!dpsoftrast.fb_colorpixels[i])
                        continue;
                // outer loop: first pass stops at maxy1, then y jumps ahead to miny2
                // and the second pass stops at maxy2; rows in between are skipped
                for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
                for (;y < bandy;y++)
                {
                        p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
                        for (x = x1;x < x2;x++)
                                p[x] = c;
                }
        }
}
939 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
940 {
941         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
942         command->r = r;
943         command->g = g;
944         command->b = b;
945         command->a = a;
946 }
947
// ClearDepth command: carries the depth value to clear to
DEFCOMMAND(3, ClearDepth, float depth;)
// Fills this thread's share of the scissor rectangle in the depth buffer
// (dpsoftrast.fb_depthpixels) with the command's depth value.
static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
{
        int x1, y1, x2, y2, w, h, x, y;
        int miny1, maxy1, miny2, maxy2;
        int bandy;
        unsigned int *p;
        unsigned int c;
        // bring the thread's framebuffer state up to date before reading fb_scissor and the y limits
        DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
        // the thread works on two row ranges, [miny1, maxy1) and [miny2, maxy2)
        miny1 = thread->miny1;
        maxy1 = thread->maxy1;
        miny2 = thread->miny2;
        maxy2 = thread->maxy2;
        // clear rectangle = scissor rectangle (x, y, width, height) ...
        x1 = thread->fb_scissor[0];
        y1 = thread->fb_scissor[1];
        x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
        y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
        // ... with its rows limited to the span from miny1 to maxy2
        if (y1 < miny1) y1 = miny1;
        if (y2 > maxy2) y2 = maxy2;
        w = x2 - x1;
        h = y2 - y1;
        if (w < 1 || h < 1)
                return;
        c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
        // outer loop: first pass stops at maxy1, then y jumps ahead to miny2
        // and the second pass stops at maxy2; rows in between are skipped
        for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
        for (;y < bandy;y++)
        {
                p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
                for (x = x1;x < x2;x++)
                        p[x] = c;
        }
}
980 void DPSOFTRAST_ClearDepth(float d)
981 {
982         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
983         command->depth = d;
984 }
985
986 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
987 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
988 {
989         thread->colormask[0] = command->r != 0;
990         thread->colormask[1] = command->g != 0;
991         thread->colormask[2] = command->b != 0;
992         thread->colormask[3] = command->a != 0;
993         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
994 }
995 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
996 {
997         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
998         command->r = r;
999         command->g = g;
1000         command->b = b;
1001         command->a = a;
1002 }
1003
1004 DEFCOMMAND(5, DepthTest, int enable;)
1005 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1006 {
1007         thread->depthtest = command->enable;
1008         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1009 }
1010 void DPSOFTRAST_DepthTest(int enable)
1011 {
1012         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1013         command->enable = enable;
1014 }
1015
1016 DEFCOMMAND(6, ScissorTest, int enable;)
1017 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1018 {
1019         thread->scissortest = command->enable;
1020         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1021 }
1022 void DPSOFTRAST_ScissorTest(int enable)
1023 {
1024         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1025         command->enable = enable;
1026 }
1027
1028 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1029 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1030 {
1031         thread->scissor[0] = command->x;
1032         thread->scissor[1] = command->y;
1033         thread->scissor[2] = command->width;
1034         thread->scissor[3] = command->height;
1035         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1036 }
1037 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1038 {
1039         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1040         command->x = x;
1041         command->y = y;
1042         command->width = width;
1043         command->height = height;
1044 }
1045
1046 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1047 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1048 {
1049         thread->blendfunc[0] = command->sfactor;
1050         thread->blendfunc[1] = command->dfactor;
1051         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1052 }
1053 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1054 {
1055         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1056         command->sfactor = sfactor;
1057         command->dfactor = dfactor;
1058 }
1059
1060 DEFCOMMAND(9, BlendSubtract, int enable;)
1061 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1062 {
1063         thread->blendsubtract = command->enable;
1064         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1065 }
1066 void DPSOFTRAST_BlendSubtract(int enable)
1067 {
1068         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1069         command->enable = enable;
1070 }
1071
1072 DEFCOMMAND(10, DepthMask, int enable;)
1073 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1074 {
1075         thread->depthmask = command->enable;
1076 }
1077 void DPSOFTRAST_DepthMask(int enable)
1078 {
1079         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1080         command->enable = enable;
1081 }
1082
1083 DEFCOMMAND(11, DepthFunc, int func;)
1084 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1085 {
1086         thread->depthfunc = command->func;
1087 }
1088 void DPSOFTRAST_DepthFunc(int func)
1089 {
1090         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1091         command->func = func;
1092 }
1093
1094 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1095 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1096 {
1097         thread->depthrange[0] = command->nearval;
1098         thread->depthrange[1] = command->farval;
1099 }
1100 void DPSOFTRAST_DepthRange(float nearval, float farval)
1101 {
1102         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1103         command->nearval = nearval;
1104         command->farval = farval;
1105 }
1106
1107 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1108 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1109 {
1110         thread->polygonoffset[0] = command->alongnormal;
1111         thread->polygonoffset[1] = command->intoview;
1112 }
1113 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1114 {
1115         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1116         command->alongnormal = alongnormal;
1117         command->intoview = intoview;
1118 }
1119
1120 DEFCOMMAND(14, CullFace, int mode;)
1121 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1122 {
1123         thread->cullface = command->mode;
1124 }
1125 void DPSOFTRAST_CullFace(int mode)
1126 {
1127         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1128         command->mode = mode;
1129 }
1130
1131 DEFCOMMAND(15, AlphaTest, int enable;)
1132 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1133 {
1134         thread->alphatest = command->enable;
1135 }
1136 void DPSOFTRAST_AlphaTest(int enable)
1137 {
1138         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1139         command->enable = enable;
1140 }
1141
1142 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1143 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1144 {
1145         thread->alphafunc = command->func;
1146         thread->alphavalue = command->ref;
1147 }
1148 void DPSOFTRAST_AlphaFunc(int func, float ref)
1149 {
1150         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1151         command->func = func;
1152         command->ref = ref;
1153 }
1154
1155 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1156 {
1157         dpsoftrast.color[0] = r;
1158         dpsoftrast.color[1] = g;
1159         dpsoftrast.color[2] = b;
1160         dpsoftrast.color[3] = a;
1161 }
1162
1163 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1164 {
1165         int outstride = blockwidth * 4;
1166         int instride = dpsoftrast.fb_width * 4;
1167         int bx1 = blockx;
1168         int by1 = blocky;
1169         int bx2 = blockx + blockwidth;
1170         int by2 = blocky + blockheight;
1171         int bw;
1172         int x;
1173         int y;
1174         unsigned char *inpixels;
1175         unsigned char *b;
1176         unsigned char *o;
1177         DPSOFTRAST_Flush();
1178         if (bx1 < 0) bx1 = 0;
1179         if (by1 < 0) by1 = 0;
1180         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1181         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1182         bw = bx2 - bx1;
1183         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1184         if (dpsoftrast.bigendian)
1185         {
1186                 for (y = by1;y < by2;y++)
1187                 {
1188                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1190                         for (x = bx1;x < bx2;x++)
1191                         {
1192                                 o[0] = b[3];
1193                                 o[1] = b[2];
1194                                 o[2] = b[1];
1195                                 o[3] = b[0];
1196                                 o += 4;
1197                                 b += 4;
1198                         }
1199                 }
1200         }
1201         else
1202         {
1203                 for (y = by1;y < by2;y++)
1204                 {
1205                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1206                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1207                         memcpy(o, b, bw*4);
1208                 }
1209         }
1210
1211 }
1212 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1213 {
1214         int tx1 = tx;
1215         int ty1 = ty;
1216         int tx2 = tx + width;
1217         int ty2 = ty + height;
1218         int sx1 = sx;
1219         int sy1 = sy;
1220         int sx2 = sx + width;
1221         int sy2 = sy + height;
1222         int swidth;
1223         int sheight;
1224         int twidth;
1225         int theight;
1226         int sw;
1227         int sh;
1228         int tw;
1229         int th;
1230         int y;
1231         unsigned int *spixels;
1232         unsigned int *tpixels;
1233         DPSOFTRAST_Texture *texture;
1234         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1235         if (mip < 0 || mip >= texture->mipmaps) return;
1236         DPSOFTRAST_Flush();
1237         spixels = dpsoftrast.fb_colorpixels[0];
1238         swidth = dpsoftrast.fb_width;
1239         sheight = dpsoftrast.fb_height;
1240         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1241         twidth = texture->mipmap[mip][2];
1242         theight = texture->mipmap[mip][3];
1243         if (tx1 < 0) tx1 = 0;
1244         if (ty1 < 0) ty1 = 0;
1245         if (tx2 > twidth) tx2 = twidth;
1246         if (ty2 > theight) ty2 = theight;
1247         if (sx1 < 0) sx1 = 0;
1248         if (sy1 < 0) sy1 = 0;
1249         if (sx2 > swidth) sx2 = swidth;
1250         if (sy2 > sheight) sy2 = sheight;
1251         tw = tx2 - tx1;
1252         th = ty2 - ty1;
1253         sw = sx2 - sx1;
1254         sh = sy2 - sy1;
1255         if (tw > sw) tw = sw;
1256         if (th > sh) th = sh;
1257         if (tw < 1 || th < 1)
1258                 return;
1259         sy1 = sheight - 1 - sy1;
1260         for (y = 0;y < th;y++)
1261                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1262         if (texture->mipmaps > 1)
1263                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1264 }
1265
1266 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1267 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1268 {
1269         if (thread->texbound[command->unitnum])
1270                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1271         thread->texbound[command->unitnum] = command->texture;
1272 }
1273 void DPSOFTRAST_SetTexture(int unitnum, int index)
1274 {
1275         DPSOFTRAST_Command_SetTexture *command;
1276         DPSOFTRAST_Texture *texture;
1277         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1278         {
1279                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1280                 return;
1281         }
1282         texture = DPSOFTRAST_Texture_GetByIndex(index);
1283         if (index && !texture)
1284         {
1285                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1286                 return;
1287         }
1288
1289         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1290         command->unitnum = unitnum;
1291         command->texture = texture;
1292
1293         dpsoftrast.texbound[unitnum] = texture;
1294         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1295 }
1296
1297 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1298 {
1299         dpsoftrast.pointer_vertex3f = vertex3f;
1300         dpsoftrast.stride_vertex = stride;
1301 }
1302 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1303 {
1304         dpsoftrast.pointer_color4f = color4f;
1305         dpsoftrast.pointer_color4ub = NULL;
1306         dpsoftrast.stride_color = stride;
1307 }
1308 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1309 {
1310         dpsoftrast.pointer_color4f = NULL;
1311         dpsoftrast.pointer_color4ub = color4ub;
1312         dpsoftrast.stride_color = stride;
1313 }
1314 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1315 {
1316         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1317         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1318         dpsoftrast.stride_texcoord[unitnum] = stride;
1319 }
1320
1321 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1322 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1323 {
1324         thread->shader_mode = command->mode;
1325         thread->shader_permutation = command->permutation;
1326         thread->shader_exactspecularmath = command->exactspecularmath;
1327 }
1328 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1329 {
1330         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1331         command->mode = mode;
1332         command->permutation = permutation;
1333         command->exactspecularmath = exactspecularmath;
1334
1335         dpsoftrast.shader_mode = mode;
1336         dpsoftrast.shader_permutation = permutation;
1337         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1338 }
1339
1340 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1341 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1342 {
1343         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1344 }
1345 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1346 {
1347         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1348         command->index = index;
1349         command->val[0] = v0;
1350         command->val[1] = v1;
1351         command->val[2] = v2;
1352         command->val[3] = v3;
1353
1354         dpsoftrast.uniform4f[index*4+0] = v0;
1355         dpsoftrast.uniform4f[index*4+1] = v1;
1356         dpsoftrast.uniform4f[index*4+2] = v2;
1357         dpsoftrast.uniform4f[index*4+3] = v3;
1358 }
1359 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1360 {
1361         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1362         command->index = index;
1363         memcpy(command->val, v, sizeof(command->val));
1364
1365         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1366 }
1367
1368 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1369 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1370 {
1371         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1372 }
1373 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1374 {
1375 #ifdef SSE2_PRESENT
1376         int i, index;
1377         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1378         {
1379                 __m128 m0, m1, m2, m3;
1380                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1381                 command->index = (DPSOFTRAST_UNIFORM)index;
1382                 if (((size_t)v)&(ALIGN_SIZE-1))
1383                 {
1384                         m0 = _mm_loadu_ps(v);
1385                         m1 = _mm_loadu_ps(v+4);
1386                         m2 = _mm_loadu_ps(v+8);
1387                         m3 = _mm_loadu_ps(v+12);
1388                 }
1389                 else
1390                 {
1391                         m0 = _mm_load_ps(v);
1392                         m1 = _mm_load_ps(v+4);
1393                         m2 = _mm_load_ps(v+8);
1394                         m3 = _mm_load_ps(v+12);
1395                 }
1396                 if (transpose)
1397                 {
1398                         __m128 t0, t1, t2, t3;
1399                         t0 = _mm_unpacklo_ps(m0, m1);
1400                         t1 = _mm_unpacklo_ps(m2, m3);
1401                         t2 = _mm_unpackhi_ps(m0, m1);
1402                         t3 = _mm_unpackhi_ps(m2, m3);
1403                         m0 = _mm_movelh_ps(t0, t1);
1404                         m1 = _mm_movehl_ps(t1, t0);
1405                         m2 = _mm_movelh_ps(t2, t3);
1406                         m3 = _mm_movehl_ps(t3, t2);                     
1407                 }
1408                 _mm_store_ps(command->val, m0);
1409                 _mm_store_ps(command->val+4, m1);
1410                 _mm_store_ps(command->val+8, m2);
1411                 _mm_store_ps(command->val+12, m3);
1412                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1413                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1414                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1415                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1416         }
1417 #endif
1418 }
1419
1420 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1421 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1422 {
1423         thread->uniform1i[command->index] = command->val;
1424 }
1425 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1426 {
1427         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1428         command->index = index;
1429         command->val = i0;
1430
1431         dpsoftrast.uniform1i[command->index] = i0;
1432 }
1433
1434 #ifdef SSE2_PRESENT
1435 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1436 {
1437         float *end = dst + size*4;
1438         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1439         {
1440                 while (dst < end)
1441                 {
1442                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1443                         dst += 4;
1444                         src += stride;
1445                 }
1446         }
1447         else
1448         {
1449                 while (dst < end)
1450                 {
1451                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1452                         dst += 4;
1453                         src += stride;
1454                 }
1455         }
1456 }
1457
1458 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1459 {
1460         float *end = dst + size*4;
1461         if (stride == sizeof(float[3]))
1462         {
1463                 float *end4 = dst + (size&~3)*4;        
1464                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1465                 {
1466                         while (dst < end4)
1467                         {
1468                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1469                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1470                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1472                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1473                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1476                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1480                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1481                                 dst += 16;
1482                                 src += 4*sizeof(float[3]);
1483                         }
1484                 }
1485                 else
1486                 {
1487                         while (dst < end4)
1488                         {
1489                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1490                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1491                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1492                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1493                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1494                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1496                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1497                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1501                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1502                                 dst += 16;
1503                                 src += 4*sizeof(float[3]);
1504                         }
1505                 }
1506         }
1507         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1508         {
1509                 while (dst < end)
1510                 {
1511                         __m128 v = _mm_loadu_ps((const float *)src);
1512                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515                         _mm_store_ps(dst, v);
1516                         dst += 4;
1517                         src += stride;
1518                 }
1519         }
1520         else
1521         {
1522                 while (dst < end)
1523                 {
1524                         __m128 v = _mm_load_ps((const float *)src);
1525                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1526                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1527                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1528                         _mm_store_ps(dst, v);
1529                         dst += 4;
1530                         src += stride;
1531                 }
1532         }
1533 }
1534
1535 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1536 {
1537         float *end = dst + size*4;
1538         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1539         if (stride == sizeof(float[2]))
1540         {
1541                 float *end2 = dst + (size&~1)*4;
1542                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1543                 {
1544                         while (dst < end2)
1545                         {
1546                                 __m128 v = _mm_loadu_ps((const float *)src);
1547                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1548                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1549                                 dst += 8;
1550                                 src += 2*sizeof(float[2]);
1551                         }
1552                 }
1553                 else
1554                 {
1555                         while (dst < end2)
1556                         {
1557                                 __m128 v = _mm_load_ps((const float *)src);
1558                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1559                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1560                                 dst += 8;
1561                                 src += 2*sizeof(float[2]);
1562                         }
1563                 }
1564         }
1565         while (dst < end)
1566         {
1567                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1568                 dst += 4;
1569                 src += stride;
1570         }
1571 }
1572
1573 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1574 {
1575         float *end = dst + size*4;
1576         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1577         if (stride == sizeof(unsigned char[4]))
1578         {
1579                 float *end4 = dst + (size&~3)*4;
1580                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1581                 {
1582                         while (dst < end4)
1583                         {
1584                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1589                                 dst += 16;
1590                                 src += 4*sizeof(unsigned char[4]);
1591                         }
1592                 }
1593                 else
1594                 {
1595                         while (dst < end4)
1596                         {
1597                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1598                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1599                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1600                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1601                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1602                                 dst += 16;
1603                                 src += 4*sizeof(unsigned char[4]);
1604                         }
1605                 }
1606         }
1607         while (dst < end)
1608         {
1609                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1610                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1611                 dst += 4;
1612                 src += stride;
1613         }
1614 }
1615
1616 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1617 {
1618         float *end = dst + 4*size;
1619         __m128 v = _mm_loadu_ps(src);
1620         while (dst < end)
1621         {
1622                 _mm_store_ps(dst, v);
1623                 dst += 4;
1624         }
1625 }
1626 #endif
1627
// multiplies `numitems` four-float vectors from in4f by the 16 float matrix inmatrix16f
// and stores the results in out4f:
//   out = in[0]*matrix[0..3] + in[1]*matrix[4..7] + in[2]*matrix[8..11] + in[3]*matrix[12..15]
// out4f is written with aligned stores and may be the same array as in4f
// when SSE2_PRESENT is not defined this function does nothing
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mat0, mat1, mat2, mat3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity matrix: a plain copy is enough, and not even that when done in place
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	mat0 = _mm_loadu_ps(inmatrix16f);
	mat1 = _mm_loadu_ps(inmatrix16f + 4);
	mat2 = _mm_loadu_ps(inmatrix16f + 8);
	mat3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 t0 = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mat0);
			__m128 t1 = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), mat1);
			__m128 t2 = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mat2);
			__m128 t3 = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mat3);
			_mm_store_ps(out4f, _mm_add_ps(t0, _mm_add_ps(t1, _mm_add_ps(t2, t3))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 t0 = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mat0);
			__m128 t1 = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), mat1);
			__m128 t2 = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mat2);
			__m128 t3 = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mat3);
			_mm_store_ps(out4f, _mm_add_ps(t0, _mm_add_ps(t1, _mm_add_ps(t2, t3))));
		}
	}
#endif
}
1675
// copies `numitems` four-float vectors from in4f to out4f
// nothing is done for a non-positive count (a negative count would otherwise turn into
// a huge unsigned size) or when both pointers name the same array (memcpy must not be
// given overlapping regions)
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, numitems * sizeof(float[4]));
}
1680
1681 #ifdef SSE2_PRESENT
// projects the vertex `in` = (x,y,z,w) to `out` using two __m128 values whose lanes
// are (c0,c1,c2,c3) for viewportcenter and (s0,s1,s2,s3) for viewportscale:
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// the vertex is rotated to (w,x,y,z) with lane 0 replaced by 1.0f for the arithmetic
// and rotated back afterwards
// expands to a braced block that declares locals named p and w
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1689
// performs exactly the same projection as DPSOFTRAST_PROJECTVERTEX (all four lanes are
// computed), so it simply forwards to that macro
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
	DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
1697
// transforms the vertex `in` by a matrix supplied as four __m128 values:
//   out = in[0]*m0 + in[1]*m1 + in[2]*m2 + in[3]*m3
// the products are summed from the last term towards the first
// expands to a braced block that declares a local named p
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps( \
		_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
			_mm_add_ps( \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1706
1707 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1708 {
1709         int clipmask = 0xFF;
1710         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1711         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1712         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1713         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1714         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1715         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1716         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1717         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1718         #define BBFRONT(k, pos) \
1719         { \
1720                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1721                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1722                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1723                 { \
1724                         __m128 proj; \
1725                         clipmask &= ~(1<<k); \
1726                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1727                         minproj = _mm_min_ss(minproj, proj); \
1728                         maxproj = _mm_max_ss(maxproj, proj); \
1729                 } \
1730         }
1731         BBFRONT(0, minpos); 
1732         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1733         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1734         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1735         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1736         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1737         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1738         BBFRONT(7, maxpos);
1739         #define BBCLIP(k) \
1740         { \
1741                 if (clipmask&(1<<k)) \
1742                 { \
1743                         if (!(clipmask&(1<<(k^1)))) \
1744                         { \
1745                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1746                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1747                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748                                 minproj = _mm_min_ss(minproj, proj); \
1749                                 maxproj = _mm_max_ss(maxproj, proj); \
1750                         } \
1751                         if (!(clipmask&(1<<(k^2)))) \
1752                         { \
1753                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1754                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1755                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1756                                 minproj = _mm_min_ss(minproj, proj); \
1757                                 maxproj = _mm_max_ss(maxproj, proj); \
1758                         } \
1759                         if (!(clipmask&(1<<(k^4)))) \
1760                         { \
1761                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1762                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1763                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1764                                 minproj = _mm_min_ss(minproj, proj); \
1765                                 maxproj = _mm_max_ss(maxproj, proj); \
1766                         } \
1767                 } \
1768         }
1769         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1770         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1771         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1772         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1773         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1774         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1775         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1776         *starty = _mm_cvttss_si32(maxproj);
1777         *endy = _mm_cvttss_si32(minproj)+1;
1778         return clipmask;
1779 }
1780         
1781 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1782 {
1783         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1784         float *end = out4f + numitems*4;
1785         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1786         __m128 minpos, maxpos;
1787         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1788         {
1789                 minpos = maxpos = _mm_loadu_ps(in4f);
1790                 while (out4f < end)
1791                 {
1792                         __m128 v = _mm_loadu_ps(in4f);
1793                         minpos = _mm_min_ps(minpos, v);
1794                         maxpos = _mm_max_ps(maxpos, v);
1795                         _mm_store_ps(out4f, v);
1796                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1797                         _mm_store_ps(screen4f, v);
1798                         in4f += 4;
1799                         out4f += 4;
1800                         screen4f += 4;
1801                 }
1802         }
1803         else
1804         {
1805                 minpos = maxpos = _mm_load_ps(in4f);
1806                 while (out4f < end)
1807                 {
1808                         __m128 v = _mm_load_ps(in4f);
1809                         minpos = _mm_min_ps(minpos, v);
1810                         maxpos = _mm_max_ps(maxpos, v);
1811                         _mm_store_ps(out4f, v);
1812                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1813                         _mm_store_ps(screen4f, v);
1814                         in4f += 4;
1815                         out4f += 4;
1816                         screen4f += 4;
1817                 }
1818         }
1819         if (starty && endy) 
1820         {
1821                 ALIGN(float minposf[4]);
1822                 ALIGN(float maxposf[4]);
1823                 _mm_store_ps(minposf, minpos);
1824                 _mm_store_ps(maxposf, maxpos);
1825                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1826         }
1827         return 0;
1828 }
1829
1830 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1831 {
1832         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1833         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1834         float *end;
1835         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1836                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1837         end = out4f + numitems*4;
1838         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1839         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1840         m0 = _mm_loadu_ps(inmatrix16f);
1841         m1 = _mm_loadu_ps(inmatrix16f + 4);
1842         m2 = _mm_loadu_ps(inmatrix16f + 8);
1843         m3 = _mm_loadu_ps(inmatrix16f + 12);
1844         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1845         {
1846                 minpos = maxpos = _mm_loadu_ps(in4f);
1847                 while (out4f < end)
1848                 {
1849                         __m128 v = _mm_loadu_ps(in4f);
1850                         minpos = _mm_min_ps(minpos, v);
1851                         maxpos = _mm_max_ps(maxpos, v);
1852                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1853                         _mm_store_ps(out4f, v);
1854                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1855                         _mm_store_ps(screen4f, v);
1856                         in4f += 4;
1857                         out4f += 4;
1858                         screen4f += 4;
1859                 }
1860         }
1861         else
1862         {
1863                 minpos = maxpos = _mm_load_ps(in4f);
1864                 while (out4f < end)
1865                 {
1866                         __m128 v = _mm_load_ps(in4f);
1867                         minpos = _mm_min_ps(minpos, v);
1868                         maxpos = _mm_max_ps(maxpos, v);
1869                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1870                         _mm_store_ps(out4f, v);
1871                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1872                         _mm_store_ps(screen4f, v);
1873                         in4f += 4;
1874                         out4f += 4;
1875                         screen4f += 4;
1876                 }
1877         }
1878         if (starty && endy) 
1879         {
1880                 ALIGN(float minposf[4]);
1881                 ALIGN(float maxposf[4]);
1882                 _mm_store_ps(minposf, minpos);
1883                 _mm_store_ps(maxposf, maxpos);
1884                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1885         }
1886         return 0;
1887 }
1888 #endif
1889
// fills dpsoftrast.post_array4f[outarray] with four-float elements converted from the
// currently bound source array `inarray`, for dpsoftrast.numvertices vertices starting
// at dpsoftrast.firstvertex, and returns the output array
// position data is expanded from three floats; color comes from the float pointer, the
// byte pointer or the constant dpsoftrast.color, in that order of preference; any other
// array is treated as a texcoord array with 2, 3 or 4 float components
// the output is left untouched when a texcoord pointer is missing or its component
// count is not 2, 3 or 4
// returns NULL when SSE2_PRESENT is not defined
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	int firstvertex = dpsoftrast.firstvertex;
	int numvertices = dpsoftrast.numvertices;
	int stride;
	int unit;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1948
1949 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1950 {
1951         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1952         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1953         return data;
1954 }
1955
1956 #if 0
// projects a vertex array into dpsoftrast.screencoord4f without transforming it and
// returns the (unchanged) array; also updates dpsoftrast.drawclipped, drawstarty and
// drawendy from DPSOFTRAST_Vertex_Project
// for inarray >= 0 the data is first loaded through DPSOFTRAST_Array_Load
// returns NULL when SSE2_PRESENT is not defined
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
1967 #endif
1968
// transforms a vertex array in place by inmatrix16f, writes the projected vertices to
// dpsoftrast.screencoord4f and returns the transformed array; also updates
// dpsoftrast.drawclipped, drawstarty and drawendy from DPSOFTRAST_Vertex_TransformProject
// for inarray >= 0 the data is first loaded through DPSOFTRAST_Array_Load
// returns NULL when SSE2_PRESENT is not defined
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1979
1980 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1981 {
1982         int x;
1983         int startx = span->startx;
1984         int endx = span->endx;
1985         float wslope = triangle->w[0];
1986         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1987         float endz = 1.0f / (w + wslope * startx);
1988         for (x = startx;x < endx;)
1989         {
1990                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1991                 float z = endz, dz;
1992                 if (nextsub >= endx) nextsub = endsub = endx-1;
1993                 endz = 1.0f / (w + wslope * nextsub);
1994                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1995                 for (; x <= endsub; x++, z += dz)
1996                         zf[x] = z;
1997         }
1998 }
1999
2000 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2001 {
2002         int x;
2003         int startx = span->startx;
2004         int endx = span->endx;
2005         int d[4];
2006         float a, b;
2007         unsigned char * RESTRICT pixelmask = span->pixelmask;
2008         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2009         if (!pixel)
2010                 return;
2011         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2012         // handle alphatest now (this affects depth writes too)
2013         if (thread->alphatest)
2014                 for (x = startx;x < endx;x++)
2015                         if (in4f[x*4+3] < 0.5f)
2016                                 pixelmask[x] = false;
2017         // FIXME: this does not handle bigendian
2018         switch(thread->fb_blendmode)
2019         {
2020         case DPSOFTRAST_BLENDMODE_OPAQUE:
2021                 for (x = startx;x < endx;x++)
2022                 {
2023                         if (!pixelmask[x])
2024                                 continue;
2025                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2026                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2027                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2028                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2029                         pixel[x*4+0] = d[0];
2030                         pixel[x*4+1] = d[1];
2031                         pixel[x*4+2] = d[2];
2032                         pixel[x*4+3] = d[3];
2033                 }
2034                 break;
2035         case DPSOFTRAST_BLENDMODE_ALPHA:
2036                 for (x = startx;x < endx;x++)
2037                 {
2038                         if (!pixelmask[x])
2039                                 continue;
2040                         a = in4f[x*4+3] * 255.0f;
2041                         b = 1.0f - in4f[x*4+3];
2042                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2043                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2044                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2045                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2046                         pixel[x*4+0] = d[0];
2047                         pixel[x*4+1] = d[1];
2048                         pixel[x*4+2] = d[2];
2049                         pixel[x*4+3] = d[3];
2050                 }
2051                 break;
2052         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2053                 for (x = startx;x < endx;x++)
2054                 {
2055                         if (!pixelmask[x])
2056                                 continue;
2057                         a = in4f[x*4+3] * 255.0f;
2058                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2059                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2060                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2061                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2062                         pixel[x*4+0] = d[0];
2063                         pixel[x*4+1] = d[1];
2064                         pixel[x*4+2] = d[2];
2065                         pixel[x*4+3] = d[3];
2066                 }
2067                 break;
2068         case DPSOFTRAST_BLENDMODE_ADD:
2069                 for (x = startx;x < endx;x++)
2070                 {
2071                         if (!pixelmask[x])
2072                                 continue;
2073                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2074                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2075                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2076                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2077                         pixel[x*4+0] = d[0];
2078                         pixel[x*4+1] = d[1];
2079                         pixel[x*4+2] = d[2];
2080                         pixel[x*4+3] = d[3];
2081                 }
2082                 break;
2083         case DPSOFTRAST_BLENDMODE_INVMOD:
2084                 for (x = startx;x < endx;x++)
2085                 {
2086                         if (!pixelmask[x])
2087                                 continue;
2088                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2089                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2090                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2091                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2092                         pixel[x*4+0] = d[0];
2093                         pixel[x*4+1] = d[1];
2094                         pixel[x*4+2] = d[2];
2095                         pixel[x*4+3] = d[3];
2096                 }
2097                 break;
2098         case DPSOFTRAST_BLENDMODE_MUL:
2099                 for (x = startx;x < endx;x++)
2100                 {
2101                         if (!pixelmask[x])
2102                                 continue;
2103                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2104                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2105                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2106                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2107                         pixel[x*4+0] = d[0];
2108                         pixel[x*4+1] = d[1];
2109                         pixel[x*4+2] = d[2];
2110                         pixel[x*4+3] = d[3];
2111                 }
2112                 break;
2113         case DPSOFTRAST_BLENDMODE_MUL2:
2114                 for (x = startx;x < endx;x++)
2115                 {
2116                         if (!pixelmask[x])
2117                                 continue;
2118                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2119                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2120                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2121                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2122                         pixel[x*4+0] = d[0];
2123                         pixel[x*4+1] = d[1];
2124                         pixel[x*4+2] = d[2];
2125                         pixel[x*4+3] = d[3];
2126                 }
2127                 break;
2128         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2129                 for (x = startx;x < endx;x++)
2130                 {
2131                         if (!pixelmask[x])
2132                                 continue;
2133                         a = in4f[x*4+3] * -255.0f;
2134                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2135                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2136                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2137                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2138                         pixel[x*4+0] = d[0];
2139                         pixel[x*4+1] = d[1];
2140                         pixel[x*4+2] = d[2];
2141                         pixel[x*4+3] = d[3];
2142                 }
2143                 break;
2144         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2145                 for (x = startx;x < endx;x++)
2146                 {
2147                         if (!pixelmask[x])
2148                                 continue;
2149                         a = 255.0f;
2150                         b = 1.0f - in4f[x*4+3];
2151                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2152                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2153                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2154                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2155                         pixel[x*4+0] = d[0];
2156                         pixel[x*4+1] = d[1];
2157                         pixel[x*4+2] = d[2];
2158                         pixel[x*4+3] = d[3];
2159                 }
2160                 break;
2161         case DPSOFTRAST_BLENDMODE_INVADD:
2162                 for (x = startx;x < endx;x++)
2163                 {
2164                         if (!pixelmask[x])
2165                                 continue;
2166                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2167                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2168                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2169                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2170                         pixel[x*4+0] = d[0];
2171                         pixel[x*4+1] = d[1];
2172                         pixel[x*4+2] = d[2];
2173                         pixel[x*4+3] = d[3];
2174                 }
2175                 break;
2176         }
2177 }
2178
2179 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2180 {
2181 #ifdef SSE2_PRESENT
2182         int x;
2183         int startx = span->startx;
2184         int endx = span->endx;
2185         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2186         unsigned char * RESTRICT pixelmask = span->pixelmask;
2187         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2188         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2189         if (!pixel)
2190                 return;
2191         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2192         pixeli += span->y * dpsoftrast.fb_width + span->x;
2193         // handle alphatest now (this affects depth writes too)
2194         if (thread->alphatest)
2195                 for (x = startx;x < endx;x++)
2196                         if (in4ub[x*4+3] < 0.5f)
2197                                 pixelmask[x] = false;
2198         // FIXME: this does not handle bigendian
2199         switch(thread->fb_blendmode)
2200         {
2201         case DPSOFTRAST_BLENDMODE_OPAQUE:
2202                 for (x = startx;x + 4 <= endx;)
2203                 {
2204                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2205                         {
2206                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2207                                 x += 4;
2208                         }
2209                         else
2210                         {
2211                                 if (pixelmask[x])
2212                                         pixeli[x] = ini[x];
2213                                 x++;
2214                         }
2215                 }
2216                 for (;x < endx;x++)
2217                         if (pixelmask[x])
2218                                 pixeli[x] = ini[x];
2219                 break;
2220         case DPSOFTRAST_BLENDMODE_ALPHA:
2221         #define FINISHBLEND(blend2, blend1) \
2222                 for (x = startx;x + 1 < endx;x += 2) \
2223                 { \
2224                         __m128i src, dst; \
2225                         switch (*(const unsigned short*)&pixelmask[x]) \
2226                         { \
2227                         case 0x0101: \
2228                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2229                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2230                                 blend2; \
2231                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2232                                 continue; \
2233                         case 0x0100: \
2234                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2235                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2236                                 blend1; \
2237                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2238                                 continue; \
2239                         case 0x0001: \
2240                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2241                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2242                                 blend1; \
2243                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2244                                 continue; \
2245                         } \
2246                         break; \
2247                 } \
2248                 for(;x < endx; x++) \
2249                 { \
2250                         __m128i src, dst; \
2251                         if (!pixelmask[x]) \
2252                                 continue; \
2253                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2254                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2255                         blend1; \
2256                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2257                 }
2258
2259                 FINISHBLEND({
2260                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2261                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2262                 }, {
2263                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2264                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2265                 });
2266                 break;
2267         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2268                 FINISHBLEND({
2269                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2270                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2271                 }, {
2272                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2273                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                 });
2275                 break;
2276         case DPSOFTRAST_BLENDMODE_ADD:
2277                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2278                 break;
2279         case DPSOFTRAST_BLENDMODE_INVMOD:
2280                 FINISHBLEND({
2281                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2282                 }, {
2283                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2284                 });
2285                 break;
2286         case DPSOFTRAST_BLENDMODE_MUL:
2287                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2288                 break;
2289         case DPSOFTRAST_BLENDMODE_MUL2:
2290                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2291                 break;
2292         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2293                 FINISHBLEND({
2294                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2295                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2296                 }, {
2297                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2298                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2299                 });
2300                 break;
2301         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2302                 FINISHBLEND({
2303                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2304                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2305                 }, {
2306                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2307                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2308                 });
2309                 break;
2310         case DPSOFTRAST_BLENDMODE_INVADD:
2311                 FINISHBLEND({
2312                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2313                 }, {
2314                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2315                 });
2316                 break;
2317         }
2318 #endif
2319 }
2320
2321 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2322 {
2323         int x;
2324         int startx = span->startx;
2325         int endx = span->endx;
2326         int flags;
2327         float c[4];
2328         float data[4];
2329         float slope[4];
2330         float tc[2], endtc[2];
2331         float tcscale[2];
2332         unsigned int tci[2];
2333         unsigned int tci1[2];
2334         unsigned int tcimin[2];
2335         unsigned int tcimax[2];
2336         int tciwrapmask[2];
2337         int tciwidth;
2338         int filter;
2339         int mip;
2340         const unsigned char * RESTRICT pixelbase;
2341         const unsigned char * RESTRICT pixel[4];
2342         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2343         // if no texture is bound, just fill it with white
2344         if (!texture)
2345         {
2346                 for (x = startx;x < endx;x++)
2347                 {
2348                         out4f[x*4+0] = 1.0f;
2349                         out4f[x*4+1] = 1.0f;
2350                         out4f[x*4+2] = 1.0f;
2351                         out4f[x*4+3] = 1.0f;
2352                 }
2353                 return;
2354         }
2355         mip = triangle->mip[texunitindex];
2356         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2357         // if this mipmap of the texture is 1 pixel, just fill it with that color
2358         if (texture->mipmap[mip][1] == 4)
2359         {
2360                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2361                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2362                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2363                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2364                 for (x = startx;x < endx;x++)
2365                 {
2366                         out4f[x*4+0] = c[0];
2367                         out4f[x*4+1] = c[1];
2368                         out4f[x*4+2] = c[2];
2369                         out4f[x*4+3] = c[3];
2370                 }
2371                 return;
2372         }
2373         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2374         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2375         flags = texture->flags;
2376         tcscale[0] = texture->mipmap[mip][2];
2377         tcscale[1] = texture->mipmap[mip][3];
2378         tciwidth = texture->mipmap[mip][2];
2379         tcimin[0] = 0;
2380         tcimin[1] = 0;
2381         tcimax[0] = texture->mipmap[mip][2]-1;
2382         tcimax[1] = texture->mipmap[mip][3]-1;
2383         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2384         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2385         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2386         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2387         for (x = startx;x < endx;)
2388         {
2389                 unsigned int subtc[2];
2390                 unsigned int substep[2];
2391                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2392                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2393                 if (nextsub >= endx)
2394                 {
2395                         nextsub = endsub = endx-1;      
2396                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2397                 }
2398                 tc[0] = endtc[0];
2399                 tc[1] = endtc[1];
2400                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2401                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2402                 substep[0] = (endtc[0] - tc[0]) * subscale;
2403                 substep[1] = (endtc[1] - tc[1]) * subscale;
2404                 subtc[0] = tc[0] * (1<<16);
2405                 subtc[1] = tc[1] * (1<<16);
2406                 if (filter)
2407                 {
2408                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2409                         {
2410                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2411                                 {
2412                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2413                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2414                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2415                                         tci[0] = subtc[0]>>16;
2416                                         tci[1] = subtc[1]>>16;
2417                                         tci1[0] = tci[0] + 1;
2418                                         tci1[1] = tci[1] + 1;
2419                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2420                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2421                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2422                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2423                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2424                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2425                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2426                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2427                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2428                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2429                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2430                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2431                                         out4f[x*4+0] = c[0];
2432                                         out4f[x*4+1] = c[1];
2433                                         out4f[x*4+2] = c[2];
2434                                         out4f[x*4+3] = c[3];
2435                                 }
2436                         }
2437                         else
2438                         {
2439                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2440                                 {
2441                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2442                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2443                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2444                                         tci[0] = subtc[0]>>16;
2445                                         tci[1] = subtc[1]>>16;
2446                                         tci1[0] = tci[0] + 1;
2447                                         tci1[1] = tci[1] + 1;
2448                                         tci[0] &= tciwrapmask[0];
2449                                         tci[1] &= tciwrapmask[1];
2450                                         tci1[0] &= tciwrapmask[0];
2451                                         tci1[1] &= tciwrapmask[1];
2452                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2453                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2454                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2455                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2456                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2457                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2458                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2459                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2460                                         out4f[x*4+0] = c[0];
2461                                         out4f[x*4+1] = c[1];
2462                                         out4f[x*4+2] = c[2];
2463                                         out4f[x*4+3] = c[3];
2464                                 }
2465                         }
2466                 }
2467                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2468                 {
2469                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2470                         {
2471                                 tci[0] = subtc[0]>>16;
2472                                 tci[1] = subtc[1]>>16;
2473                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2474                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2475                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2476                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2477                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2478                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2479                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2480                                 out4f[x*4+0] = c[0];
2481                                 out4f[x*4+1] = c[1];
2482                                 out4f[x*4+2] = c[2];
2483                                 out4f[x*4+3] = c[3];
2484                         }
2485                 }
2486                 else
2487                 {
2488                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2489                         {
2490                                 tci[0] = subtc[0]>>16;
2491                                 tci[1] = subtc[1]>>16;
2492                                 tci[0] &= tciwrapmask[0];
2493                                 tci[1] &= tciwrapmask[1];
2494                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2495                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2496                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2497                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2498                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2499                                 out4f[x*4+0] = c[0];
2500                                 out4f[x*4+1] = c[1];
2501                                 out4f[x*4+2] = c[2];
2502                                 out4f[x*4+3] = c[3];
2503                         }
2504                 }
2505         }
2506 }
2507
2508 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2509 {
2510 #ifdef SSE2_PRESENT
2511         int x;
2512         int startx = span->startx;
2513         int endx = span->endx;
2514         int flags;
2515         __m128 data, slope, tcscale;
2516         __m128i tcsize, tcmask, tcoffset, tcmax;
2517         __m128 tc, endtc;
2518         __m128i subtc, substep, endsubtc;
2519         int filter;
2520         int mip;
2521         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2522         const unsigned char * RESTRICT pixelbase;
2523         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2524         // if no texture is bound, just fill it with white
2525         if (!texture)
2526         {
2527                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2528                 return;
2529         }
2530         mip = triangle->mip[texunitindex];
2531         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2532         // if this mipmap of the texture is 1 pixel, just fill it with that color
2533         if (texture->mipmap[mip][1] == 4)
2534         {
2535                 unsigned int k = *((const unsigned int *)pixelbase);
2536                 for (x = startx;x < endx;x++)
2537                         outi[x] = k;
2538                 return;
2539         }
2540         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2541         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2542         flags = texture->flags;
2543         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2544         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2545         tcscale = _mm_cvtepi32_ps(tcsize);
2546         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2547         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2548         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2549         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2550         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2551         tcmax = _mm_packs_epi32(tcmask, tcmask);
2552         for (x = startx;x < endx;)
2553         {
2554                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2555                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2556                 if (nextsub >= endx)
2557                 {
2558                         nextsub = endsub = endx-1;
2559                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2560                 }       
2561                 tc = endtc;
2562                 subtc = endsubtc;
2563                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2564                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2565                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2566                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2567                 substep = _mm_slli_epi32(substep, 1);
2568                 if (filter)
2569                 {
2570                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2571                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2572                         {
2573                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2574                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2575                                 {
2576                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2577                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2578                                         tci = _mm_madd_epi16(tci, tcoffset);
2579                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2580                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2581                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2582                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2583                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2584                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2585                                         fracm = _mm_srli_epi16(subtc, 1);
2586                                         pix1 = _mm_add_epi16(pix1,
2587                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2588                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2589                                         pix3 = _mm_add_epi16(pix3,
2590                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2591                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2592                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2593                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2594                                         pix2 = _mm_add_epi16(pix2,
2595                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2596                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2597                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2598                                 }
2599                                 if (x <= endsub)
2600                                 {
2601                                         const unsigned char * RESTRICT ptr1;
2602                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2603                                         tci = _mm_madd_epi16(tci, tcoffset);
2604                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2605                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2606                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2607                                         fracm = _mm_srli_epi16(subtc, 1);
2608                                         pix1 = _mm_add_epi16(pix1,
2609                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2610                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2611                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2612                                         pix1 = _mm_add_epi16(pix1,
2613                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2614                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2615                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2616                                         x++;
2617                                 }
2618                         }
2619                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2620                         {
2621                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2622                                 {
2623                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2624                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2625                                         tci = _mm_madd_epi16(tci, tcoffset);
2626                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2627                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2628                                                                                         _mm_setzero_si128());
2629                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2630                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2631                                                                                         _mm_setzero_si128());
2632                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2633                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2634                                         tci = _mm_madd_epi16(tci, tcoffset);
2635                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2636                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2637                                                                                         _mm_setzero_si128());
2638                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2639                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2640                                                                                         _mm_setzero_si128());
2641                                         fracm = _mm_srli_epi16(subtc, 1);
2642                                         pix1 = _mm_add_epi16(pix1,
2643                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645                                         pix3 = _mm_add_epi16(pix3,
2646                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2647                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2648                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2649                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2650                                         pix2 = _mm_add_epi16(pix2,
2651                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2652                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2653                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2654                                 }
2655                                 if (x <= endsub)
2656                                 {
2657                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2658                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2659                                         tci = _mm_madd_epi16(tci, tcoffset);
2660                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2661                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2662                                                                                         _mm_setzero_si128());
2663                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2664                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2665                                                                                         _mm_setzero_si128());
2666                                         fracm = _mm_srli_epi16(subtc, 1);
2667                                         pix1 = _mm_add_epi16(pix1,
2668                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2670                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2671                                         pix1 = _mm_add_epi16(pix1,
2672                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2673                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2674                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2675                                         x++;
2676                                 }
2677                         }
2678                         else
2679                         {
2680                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2681                                 {
2682                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2683                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2684                                         tci = _mm_madd_epi16(tci, tcoffset);
2685                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2686                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2687                                                                                         _mm_setzero_si128());
2688                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2689                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2690                                                                                         _mm_setzero_si128());
2691                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2692                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2693                                         tci = _mm_madd_epi16(tci, tcoffset);
2694                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2695                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2696                                                                                         _mm_setzero_si128());
2697                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2698                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2699                                                                                         _mm_setzero_si128());
2700                                         fracm = _mm_srli_epi16(subtc, 1);
2701                                         pix1 = _mm_add_epi16(pix1,
2702                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2703                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2704                                         pix3 = _mm_add_epi16(pix3,
2705                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2706                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2707                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2708                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2709                                         pix2 = _mm_add_epi16(pix2,
2710                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2711                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2712                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2713                                 }
2714                                 if (x <= endsub)
2715                                 {
2716                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2717                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2718                                         tci = _mm_madd_epi16(tci, tcoffset);
2719                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2720                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2721                                                                                         _mm_setzero_si128());
2722                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2723                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2724                                                                                         _mm_setzero_si128());
2725                                         fracm = _mm_srli_epi16(subtc, 1);
2726                                         pix1 = _mm_add_epi16(pix1,
2727                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2729                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2730                                         pix1 = _mm_add_epi16(pix1,
2731                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2732                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2733                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2734                                         x++;
2735                                 }
2736                         }
2737                 }
2738                 else
2739                 {
2740                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2741                         {
2742                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2743                                 {
2744                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2745                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2746                                         tci = _mm_madd_epi16(tci, tcoffset);
2747                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2748                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2749                                 }
2750                                 if (x <= endsub)
2751                                 {
2752                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2753                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2754                                         tci = _mm_madd_epi16(tci, tcoffset);
2755                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2756                                         x++;
2757                                 }
2758                         }
2759                         else
2760                         {
2761                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2762                                 {
2763                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2764                                         tci = _mm_and_si128(tci, tcmax); 
2765                                         tci = _mm_madd_epi16(tci, tcoffset);
2766                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2767                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2768                                 }
2769                                 if (x <= endsub)
2770                                 {
2771                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2772                                         tci = _mm_and_si128(tci, tcmax); 
2773                                         tci = _mm_madd_epi16(tci, tcoffset);
2774                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2775                                         x++;
2776                                 }
2777                         }
2778                 }
2779         }
2780 #endif
2781 }
2782
2783 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2784 {
2785         // TODO: IMPLEMENT
2786         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2787 }
2788
// Shadowmap lookup stub: the sample position in vector is ignored and
// 1.0f is always returned.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2794
2795 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2796 {
2797         int x;
2798         int startx = span->startx;
2799         int endx = span->endx;
2800         float c[4];
2801         float data[4];
2802         float slope[4];
2803         float z;
2804         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2805         for (x = startx;x < endx;x++)
2806         {
2807                 z = zf[x];
2808                 c[0] = (data[0] + slope[0]*x) * z;
2809                 c[1] = (data[1] + slope[1]*x) * z;
2810                 c[2] = (data[2] + slope[2]*x) * z;
2811                 c[3] = (data[3] + slope[3]*x) * z;
2812                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2813                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2814                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2815                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2816         }
2817 }
2818
2819 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2820 {
2821         int x;
2822         int startx = span->startx;
2823         int endx = span->endx;
2824         float c[4];
2825         float data[4];
2826         float slope[4];
2827         float z;
2828         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2829         for (x = startx;x < endx;x++)
2830         {
2831                 z = zf[x];
2832                 c[0] = (data[0] + slope[0]*x) * z;
2833                 c[1] = (data[1] + slope[1]*x) * z;
2834                 c[2] = (data[2] + slope[2]*x) * z;
2835                 c[3] = (data[3] + slope[3]*x) * z;
2836                 out4f[x*4+0] = c[0];
2837                 out4f[x*4+1] = c[1];
2838                 out4f[x*4+2] = c[2];
2839                 out4f[x*4+3] = c[3];
2840         }
2841 }
2842
2843 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2844 {
2845         int x, startx = span->startx, endx = span->endx;
2846         float c[4], localcolor[4];
2847         localcolor[0] = subcolor[0];
2848         localcolor[1] = subcolor[1];
2849         localcolor[2] = subcolor[2];
2850         localcolor[3] = subcolor[3];
2851         for (x = startx;x < endx;x++)
2852         {
2853                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2854                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2855                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2856                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2857                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2858                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2859                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2860                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2861         }
2862 }
2863
2864 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2865 {
2866         int x, startx = span->startx, endx = span->endx;
2867         for (x = startx;x < endx;x++)
2868         {
2869                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2870                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2871                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2872                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2873         }
2874 }
2875
2876 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2877 {
2878         int x, startx = span->startx, endx = span->endx;
2879         for (x = startx;x < endx;x++)
2880         {
2881                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2882                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2883                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2884                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2885         }
2886 }
2887
2888 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2889 {
2890         int x, startx = span->startx, endx = span->endx;
2891         float a, b;
2892         for (x = startx;x < endx;x++)
2893         {
2894                 a = 1.0f - inb4f[x*4+3];
2895                 b = inb4f[x*4+3];
2896                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2897                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2898                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2899                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2900         }
2901 }
2902
2903 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2904 {
2905         int x, startx = span->startx, endx = span->endx;
2906         float localcolor[4], ilerp, lerp;
2907         localcolor[0] = color[0];
2908         localcolor[1] = color[1];
2909         localcolor[2] = color[2];
2910         localcolor[3] = color[3];
2911         ilerp = 1.0f - localcolor[3];
2912         lerp = localcolor[3];
2913         for (x = startx;x < endx;x++)
2914         {
2915                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2916                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2917                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2918                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2919         }
2920 }
2921
2922
2923
2924 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2925 {
2926 #ifdef SSE2_PRESENT
2927         int x;
2928         int startx = span->startx;
2929         int endx = span->endx;
2930         __m128 data, slope;
2931         __m128 mod, endmod;
2932         __m128i submod, substep, endsubmod;
2933         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2934         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2935         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2936         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2937         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2938         for (x = startx; x < endx;)
2939         {
2940                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2941                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2942                 if (nextsub >= endx)
2943                 {
2944                         nextsub = endsub = endx-1;
2945                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2946                 }
2947                 mod = endmod;
2948                 submod = endsubmod;
2949                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2950                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2951                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2952                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2953                 substep = _mm_packs_epi32(substep, substep);
2954                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2955                 {
2956                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2957                         pix = _mm_mulhi_epu16(pix, submod);
2958                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2959                 }
2960                 if (x <= endsub)
2961                 {
2962                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2963                         pix = _mm_mulhi_epu16(pix, submod);
2964                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2965                         x++;
2966                 }
2967         }
2968 #endif
2969 }
2970
2971 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2972 {
2973 #ifdef SSE2_PRESENT
2974         int x;
2975         int startx = span->startx;
2976         int endx = span->endx;
2977         __m128 data, slope;
2978         __m128 mod, endmod;
2979         __m128i submod, substep, endsubmod;
2980         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2981         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2982         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2983         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2984         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2985         for (x = startx; x < endx;)
2986         {
2987                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2988                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2989                 if (nextsub >= endx)
2990                 {
2991                         nextsub = endsub = endx-1;
2992                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2993                 }
2994                 mod = endmod;
2995                 submod = endsubmod;
2996                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2997                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2998                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2999                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3000                 substep = _mm_packs_epi32(substep, substep);
3001                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3002                 {
3003                         __m128i pix = _mm_srai_epi16(submod, 4);
3004                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3005                 }
3006                 if (x <= endsub)
3007                 {
3008                         __m128i pix = _mm_srai_epi16(submod, 4);
3009                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3010                         x++;
3011                 }
3012         }
3013 #endif
3014 }
3015
3016 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3017 {
3018 #ifdef SSE2_PRESENT
3019         int x, startx = span->startx, endx = span->endx;
3020         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3021         localcolor = _mm_packs_epi32(localcolor, localcolor);
3022         for (x = startx;x+2 <= endx;x+=2)
3023         {
3024                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3025                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3026                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3027                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3028         }
3029         if (x < endx)
3030         {
3031                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3032                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3033                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3034                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3035         }
3036 #endif
3037 }
3038
3039 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3040 {
3041 #ifdef SSE2_PRESENT
3042         int x, startx = span->startx, endx = span->endx;
3043         for (x = startx;x+2 <= endx;x+=2)
3044         {
3045                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3046                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3047                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3048                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3049         }
3050         if (x < endx)
3051         {
3052                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3053                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3054                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3055                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3056         }
3057 #endif
3058 }
3059
3060 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3061 {
3062 #ifdef SSE2_PRESENT
3063         int x, startx = span->startx, endx = span->endx;
3064         for (x = startx;x+2 <= endx;x+=2)
3065         {
3066                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3067                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3068                 pix1 = _mm_add_epi16(pix1, pix2);
3069                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3070         }
3071         if (x < endx)
3072         {
3073                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3074                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3075                 pix1 = _mm_add_epi16(pix1, pix2);
3076                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3077         }
3078 #endif
3079 }
3080
3081 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3082 {
3083 #ifdef SSE2_PRESENT
3084         int x, startx = span->startx, endx = span->endx;
3085         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3086         tint = _mm_packs_epi32(tint, tint);
3087         for (x = startx;x+2 <= endx;x+=2)
3088         {
3089                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3090                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3091                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3092                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3093         }
3094         if (x < endx)
3095         {
3096                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3097                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3098                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3099                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3100         }
3101 #endif
3102 }
3103
3104 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3105 {
3106 #ifdef SSE2_PRESENT
3107         int x, startx = span->startx, endx = span->endx;
3108         for (x = startx;x+2 <= endx;x+=2)
3109         {
3110                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3111                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3112                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3113                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3114                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3115         }
3116         if (x < endx)
3117         {
3118                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3119                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3120                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3121                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3122                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3123         }
3124 #endif
3125 }
3126
3127 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3128 {
3129 #ifdef SSE2_PRESENT
3130         int x, startx = span->startx, endx = span->endx;
3131         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3132         localcolor = _mm_packs_epi32(localcolor, localcolor);
3133         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3134         for (x = startx;x+2 <= endx;x+=2)
3135         {
3136                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3137                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3138                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3139         }
3140         if (x < endx)
3141         {
3142                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3143                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3144                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3145         }
3146 #endif
3147 }
3148
3149
3150
3151 void DPSOFTRAST_VertexShader_Generic(void)
3152 {
3153         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3154         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3155         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3156         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3157                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3158 }
3159
3160 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3161 {
3162         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3163         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3164         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3165         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3166         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3167         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3168         {
3169                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3170                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3171                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3172                 {
3173                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3174                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3175                         {
3176                                 // multiply
3177                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3178                         }
3179                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3180                         {
3181                                 // add
3182                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3183                         }
3184                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3185                         {
3186                                 // alphablend
3187                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3188                         }
3189                 }
3190         }
3191         else
3192                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3193         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3194 }
3195
3196
3197
3198 void DPSOFTRAST_VertexShader_PostProcess(void)
3199 {
3200         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3201         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3202         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3203 }
3204
3205 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3206 {
3207         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3208         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3209         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3210         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3211         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3212         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3213         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3214         {
3215                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3216                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3217         }
3218         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3219         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3220         {
3221                 // TODO: implement saturation
3222         }
3223         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3224         {
3225                 // TODO: implement gammaramps
3226         }
3227         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3228 }
3229
3230
3231
3232 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3233 {
3234         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3235 }
3236
3237 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3238 {
3239         // this is never called (because colormask is off when this shader is used)
3240         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3241         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3242         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3243         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3244         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3245 }
3246
3247
3248
3249 void DPSOFTRAST_VertexShader_FlatColor(void)
3250 {
3251         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3252         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3253 }
3254
3255 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3256 {
3257 #ifdef SSE2_PRESENT
3258         unsigned char * RESTRICT pixelmask = span->pixelmask;
3259         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3260         int x, startx = span->startx, endx = span->endx;
3261         __m128i Color_Ambientm;
3262         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3263         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3264         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3265         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3266         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3267         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3268                 pixel = buffer_FragColorbgra8;
3269         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3270         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3271         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3272         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3273         for (x = startx;x < endx;x++)
3274         {
3275                 __m128i color, pix;
3276                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3277                 {
3278                         __m128i pix2;
3279                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3280                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3281                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3282                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3283                         x += 3;
3284                         continue;
3285                 }
3286                 if (!pixelmask[x])
3287                         continue;
3288                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3289                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3290                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3291         }
3292         if (pixel == buffer_FragColorbgra8)
3293                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3294 #endif
3295 }
3296
3297
3298
3299 void DPSOFTRAST_VertexShader_VertexColor(void)
3300 {
3301         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3302         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3303         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3304 }
3305
3306 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3307 {
3308 #ifdef SSE2_PRESENT
3309         unsigned char * RESTRICT pixelmask = span->pixelmask;
3310         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3311         int x, startx = span->startx, endx = span->endx;
3312         __m128i Color_Ambientm, Color_Diffusem;
3313         __m128 data, slope;
3314         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3315         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3316         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3317         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3318         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3319         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3320         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3321                 pixel = buffer_FragColorbgra8;
3322         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3323         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3324         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3325         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3326         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3327         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3328         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3329         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3330         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3331         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3332         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3333         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3334         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3335         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3336         {
3337                 __m128i color, mod, pix;
3338                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3339                 {
3340                         __m128i pix2, mod2;
3341                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3342                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3343                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3344                         data = _mm_add_ps(data, slope);
3345                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3346                         data = _mm_add_ps(data, slope);
3347                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3348                         data = _mm_add_ps(data, slope);
3349                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3350                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3351                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3352                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3353                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3354                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3355                         x += 3;
3356                         continue;
3357                 }
3358                 if (!pixelmask[x])
3359                         continue;
3360                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3361                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3362                 mod = _mm_packs_epi32(mod, mod);
3363                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3364                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3365         }
3366         if (pixel == buffer_FragColorbgra8)
3367                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3368 #endif
3369 }
3370
3371
3372
3373 void DPSOFTRAST_VertexShader_Lightmap(void)
3374 {
3375         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3376         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3377         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3378 }
3379
3380 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3381 {
3382 #ifdef SSE2_PRESENT
3383         unsigned char * RESTRICT pixelmask = span->pixelmask;
3384         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3385         int x, startx = span->startx, endx = span->endx;
3386         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3387         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3388         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3389         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3390         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3391         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3392         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3393         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3394         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3395         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3396                 pixel = buffer_FragColorbgra8;
3397         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3398         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3399         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3400         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3401         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3402         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3403         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3404         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3405         {
3406                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3407                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3408                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3409                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3410                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3411                 for (x = startx;x < endx;x++)
3412                 {
3413                         __m128i color, lightmap, glow, pix;
3414                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3415                         {