detect whether dpsoftrast is being built with SDL
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "dpsoftrast.h"
7
// Worker threads are only enabled when the build defines USE_SDL; the SDL
// headers below are pulled in solely for that threaded configuration.
8 #ifdef USE_SDL
9 #define USE_THREADS
10 #endif
11
12 #ifdef USE_THREADS
13 #include <SDL.h>
14 #include <SDL_thread.h>
15 #endif
16
// When compiled as C, alias bool to the engine's qboolean type.
17 #ifndef __cplusplus
18 typedef qboolean bool;
19 #endif
20
// Alignment (in bytes) used by the ALIGN and ATOMIC wrappers below.
// Note the wrappers repeat these values as literals (16 and 32).
21 #define ALIGN_SIZE 16
22 #define ATOMIC_SIZE 32
23
// ALIGN(var)/ATOMIC(var) wrap a declaration with the compiler specific
// alignment attribute (16 and 32 bytes respectively); on unknown compilers
// they expand to the plain declaration with no alignment request.
// MEMORY_BARRIER expands to a store fence on GCC and MSVC.
24 #if defined(__GNUC__)
25 #define ALIGN(var) var __attribute__((__aligned__(16)))
26 #define ATOMIC(var) var __attribute__((__aligned__(32)))
27 #define MEMORY_BARRIER (_mm_sfence())
28 //(__sync_synchronize())
29 #elif defined(_MSC_VER)
30 #define ALIGN(var) __declspec(align(16)) var
31 #define ATOMIC(var) __declspec(align(32)) var
32 #define MEMORY_BARRIER (_mm_sfence())
33 //(MemoryBarrier())
34 #else
35 #define ALIGN(var) var
36 #define ATOMIC(var) var
37 #define MEMORY_BARRIER ((void)0)
38 #endif
39
// Without threads or without SSE2 the barrier is turned into a no-op
// (_mm_sfence is only available once <emmintrin.h> has been included).
40 #if !defined(USE_THREADS) || !defined(SSE2_PRESENT)
41 #undef MEMORY_BARRIER
42 #define MEMORY_BARRIER ((void)0)
43 #endif
44
// Allocation wrappers: with SSE2 all blocks are aligned to ATOMIC_SIZE via
// _mm_malloc/_mm_free, otherwise the plain C allocator is used.
// Memory obtained from MM_MALLOC/MM_CALLOC must be released with MM_FREE.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// Aligned equivalent of calloc: returns nmemb*size zeroed bytes, or NULL if
// the allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject requests whose byte count would wrap around, as calloc does
	if (nmemb != 0 && size > ((size_t)-1) / nmemb)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc((nmemb), (size))
#define MM_FREE free
#endif
63
// Slot numbers for the per-attribute arrays (used to size and index tables
// such as in_array4f, post_array4f and the per-triangle attribs table).
// DPSOFTRAST_ARRAY_TOTAL is the number of slots, not a slot itself.
64 typedef enum DPSOFTRAST_ARRAY_e
65 {
66         DPSOFTRAST_ARRAY_POSITION,
67         DPSOFTRAST_ARRAY_COLOR,
68         DPSOFTRAST_ARRAY_TEXCOORD0,
69         DPSOFTRAST_ARRAY_TEXCOORD1,
70         DPSOFTRAST_ARRAY_TEXCOORD2,
71         DPSOFTRAST_ARRAY_TEXCOORD3,
72         DPSOFTRAST_ARRAY_TEXCOORD4,
73         DPSOFTRAST_ARRAY_TEXCOORD5,
74         DPSOFTRAST_ARRAY_TEXCOORD6,
75         DPSOFTRAST_ARRAY_TEXCOORD7,
76         DPSOFTRAST_ARRAY_TOTAL
77 }
78 DPSOFTRAST_ARRAY;
79
// One entry of the texture table.  A slot whose bytes pointer is NULL is
// treated as unused by the texture lookup and allocation code.
80 typedef struct DPSOFTRAST_Texture_s
81 {
        // flags as passed to DPSOFTRAST_Texture_New (format and FLAG_ bits)
82         int flags;
        // dimensions of mip level 0
83         int width;
84         int height;
85         int depth;
        // 6 for cubemaps, 1 otherwise
86         int sides;
87         DPSOFTRAST_TEXTURE_FILTER filter;
        // number of valid rows in the mipmap table below
88         int mipmaps;
        // total size in bytes of all mip levels
89         int size;
        // pixel storage for all mip levels, 4 bytes per pixel
90         unsigned char *bytes;
        // per mip level: [0] byte offset into bytes, [1] size in bytes,
        // [2] width, [3] height, [4] depth
91         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
92 }
93 DPSOFTRAST_Texture;
94
// Commands are aligned the same way as other ALIGN data.
95 #define COMMAND_SIZE ALIGN_SIZE
96 #define COMMAND_ALIGN(var) ALIGN(var)
97
// Common header of every command: each command struct begins with its opcode.
98 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
99 {
100         int opcode;
101 }
102 DPSOFTRAST_Command);
103
104 enum { DPSOFTRAST_OPCODE_Reset = 0 };
105
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and an aligned struct
// DPSOFTRAST_Command_<name> made of an int opcode followed by the given fields.
106 #define DEFCOMMAND(opcodeval, name, fields) \
107         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
108         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
109         { \
110                 int opcode; \
111                 fields \
112         } DPSOFTRAST_Command_##name );
113
// Size in bytes of the command buffer below.
114 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
115
// Storage for queued commands.
// NOTE(review): freecommand/usedcommands presumably are a byte offset of the
// next free command and the number of bytes in use - confirm against the
// command allocation code.
116 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
117 {
118         int freecommand;
119         int usedcommands;
120         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
121 }
122 DPSOFTRAST_State_Command_Pool);
123
// One entry of the triangle pool.
124 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
125 {
        // value of the command pool's freecommand when this entry was queued
126         int commandoffset;
127         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        // DPSOFTRAST_Draw_SyncCommands stores -1 in both fields for the entries it queues
128         int starty;
129         int endy;
130         int numpoints;
131         float w[3];
132         ALIGN(float coords[4][4]);
133         ALIGN(int ycoords[4]);
        // per attribute array: [0] slope along x, [1] slope along y, [2] base
        // value (see the DPSOFTRAST_CALCATTRIB macros)
134         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
135 }
136 DPSOFTRAST_State_Triangle);
137
// Evaluates one interpolated attribute at the start of a span.
//   triangle   - pointer to a triangle whose attribs table is read
//   span       - pointer to a span; its x and y fields are the evaluation point
//   data       - receives attribs[arrayindex][2] + x*attribs[..][0] + y*attribs[..][1]
//   slope      - receives attribs[arrayindex][0] (the per-pixel step along x)
//   arrayindex - attribute array slot to evaluate
// SSE version: data and slope are __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of the above: data and slope are arrays of 4 floats.
// The span argument is parenthesized before dereferencing so that any pointer
// expression (for example &localspan) can be passed.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
154                                         
155 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
156
// A horizontal run of pixels produced from one triangle; each worker thread
// keeps an array of these (see DPSOFTRAST_State_Thread).
157 typedef ALIGN(struct DPSOFTRAST_State_Span_s
158 {
159         int triangle; // triangle this span was generated by
160         int x; // framebuffer x coord
161         int y; // framebuffer y coord
162         int length; // pixel count
163         int startx; // usable range (according to pixelmask)
164         int endx; // usable range (according to pixelmask)
165         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
166 }
167 DPSOFTRAST_State_Span);
168
// Capacity of the per-thread span array.
169 #define DPSOFTRAST_DRAW_MAXSPANS 1024
170
// Number of entries in the triangle ring buffer below.
171 #define DPSOFTRAST_DRAW_MAXTRIANGLEPOOL 4096
172 #define DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES 64
173
// Ring buffer of queued triangles.  freetriangle is the index of the next
// entry to be written (it wraps to 0 at the end of the array) and
// usedtriangles counts the entries currently considered in use.
174 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_Pool_s
175 {
176         int freetriangle;
177         int usedtriangles;
178         ATOMIC(DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLEPOOL]);
179 }
180 DPSOFTRAST_State_Triangle_Pool);
181
// Bits of the per-thread validate field; a set bit means the matching derived
// values are stale and are recomputed by DPSOFTRAST_Validate.
182 #define DPSOFTRAST_VALIDATE_FB 1
183 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
184 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all of the above combined
185 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
186
// Blend modes that DPSOFTRAST_RecalcBlendFunc derives from the blendfunc
// factor pair and stores in fb_blendmode; DPSOFTRAST_BLENDMODE_TOTAL is the
// number of modes.
187 typedef enum DPSOFTRAST_BLENDMODE_e
188 {
189         DPSOFTRAST_BLENDMODE_OPAQUE,
190         DPSOFTRAST_BLENDMODE_ALPHA,
191         DPSOFTRAST_BLENDMODE_ADDALPHA,
192         DPSOFTRAST_BLENDMODE_ADD,
193         DPSOFTRAST_BLENDMODE_INVMOD,
194         DPSOFTRAST_BLENDMODE_MUL,
195         DPSOFTRAST_BLENDMODE_MUL2,
196         DPSOFTRAST_BLENDMODE_SUBALPHA,
197         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
198         DPSOFTRAST_BLENDMODE_TOTAL
199 }
200 DPSOFTRAST_BLENDMODE;
201
// State owned by one rasterizer thread: its copy of the render state, the
// values derived from it, its position in the shared pools and its span list.
202 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
203 {
204 #ifdef USE_THREADS
205         SDL_Thread *thread;
206 #endif
207         int index;
208         
        // render state as last set for this thread
209         int colormask[4];
210         int blendfunc[2];
211         int blendsubtract;
212         int depthmask;
213         int depthtest;
214         int depthfunc;
215         int scissortest;
216         int alphatest;
217         int alphafunc;
218         float alphavalue;
        // scissor is read as x, y, width, height by DPSOFTRAST_RecalcFB
219         int scissor[4];
220         int viewport[4];
221         float depthrange[2];
222         float polygonoffset[2];
223
224         int shader_mode;
225         int shader_permutation;
226
227         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
228         
229         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
230         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
231
232         // DPSOFTRAST_VALIDATE_ flags
233         int validate;
234
235         // derived values (DPSOFTRAST_VALIDATE_FB)
236         int fb_colormask;
237         int fb_clearscissor[4];
238
239         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
240         int fb_depthfunc;
241
242         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
243         int fb_blendmode;
244
        // NOTE(review): presumably how far this thread has consumed the
        // command pool - confirm against the command processing code
245         ATOMIC(int commandoffset);
        // compared against the pool's freetriangle to measure how many
        // triangles this thread still holds (see DPSOFTRAST_Draw_FreeTrianglePool)
246         int triangleoffset;
247
        // set while the submitting code is blocked on waitcond for this thread
248         bool waiting;
249 #ifdef USE_THREADS
250         SDL_cond *waitcond;
251 #endif
252
253         int numspans;
254         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
255 }
256 DPSOFTRAST_State_Thread);
257
// Global rasterizer state shared by the submitting code and all worker threads.
258 typedef ATOMIC(struct DPSOFTRAST_State_s
259 {
        // current render targets (set by DPSOFTRAST_SetRenderTargets)
260         int fb_width;
261         int fb_height;
262         unsigned int *fb_depthpixels;
263         unsigned int *fb_colorpixels[4];
264
265         int viewport[4];
266         ALIGN(float fb_viewportcenter[4]);
267         ALIGN(float fb_viewportscale[4]);
268
269         float color[4];
270         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
271         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
272
273         int cullface;
274
275         const float *pointer_vertex3f;
276         const float *pointer_color4f;
277         const unsigned char *pointer_color4ub;
278         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
279         int stride_vertex;
280         int stride_color;
281         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
282         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
283         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
284
285         int numvertices;
286         int maxvertices;
287         float *in_array4f[DPSOFTRAST_ARRAY_TOTAL];
288         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
289         float *screencoord4f;
290
291         int shader_mode;
292         int shader_permutation;
293
        // texture table: texture_max is the allocated slot count, texture_end
        // is one past the highest slot in use, and texture_firstfree is where
        // the search for an unused slot starts
294         int texture_max;
295         int texture_end;
296         int texture_firstfree;
297         DPSOFTRAST_Texture *texture;
298
299         int bigendian;
300
301         // error reporting
302         const char *errorstring;
303
304         int numthreads;
305         DPSOFTRAST_State_Thread *threads;
306 #ifdef USE_THREADS
        // mutex/condition pair used when waiting for threads to release triangles
307         SDL_mutex *trianglemutex;
308         SDL_cond *trianglecond;
309 #endif
310
        // written by DPSOFTRAST_Draw_SyncCommands after a memory barrier with
        // the pool's new freetriangle index
311         ATOMIC(int drawtriangle);
312
313         DPSOFTRAST_State_Command_Pool commandpool;
314         DPSOFTRAST_State_Triangle_Pool trianglepool;
315 }
316 DPSOFTRAST_State);
317
// the single global instance of the rasterizer state
318 DPSOFTRAST_State dpsoftrast;
319
320 extern int dpsoftrast_test;
321
// scale applied to (1-depth) when converting a float depth to an integer
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one int with rounding: a in bits 24-31,
// r in bits 16-23, g in bits 8-15, b in bits 0-7.
// Every argument is parenthesized so expressions can be passed safely, and the
// alpha shift is done on an unsigned value because shifting 255 into the sign
// bit of an int is undefined behavior.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth d to the integer depth DPSOFTRAST_DEPTHSCALE * (1-d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
327
328 void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
329 {
330         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
331         // and viewport projection values
332         int x1, x2;
333         int y1, y2;
334         x1 = thread->scissor[0];
335         x2 = thread->scissor[0] + thread->scissor[2];
336         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
337         y2 = dpsoftrast.fb_height - thread->scissor[1];
338         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
339         if (x1 < 0) x1 = 0;
340         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
341         if (y1 < 0) y1 = 0;
342         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
343         thread->fb_clearscissor[0] = x1;
344         thread->fb_clearscissor[1] = y1;
345         thread->fb_clearscissor[2] = x2 - x1;
346         thread->fb_clearscissor[3] = y2 - y1;
347 }
348
349 void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
350 {
351         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
352 }
353
354 void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
355 {
356         if (thread->blendsubtract)
357         {
358                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
359                 {
360                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
361                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
362                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
363                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
364                 }
365         }
366         else
367         {       
368             switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
369             {
370                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
371                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
372                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
373                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
374                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
375                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
376                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
377                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
378                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
379                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
380                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
381             }
382         }
383 }
384
// Calls DPSOFTRAST_Validate(thread, f) only if one of the requested flags is
// actually pending; always evaluates to 0.  The arguments are parenthesized so
// that any pointer expression can be passed as thread.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
386
387 void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
388 {
389         mask &= thread->validate;
390         if (!mask)
391                 return;
392         if (mask & DPSOFTRAST_VALIDATE_FB)
393         {
394                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
395                 DPSOFTRAST_RecalcFB(thread);
396         }
397         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
398         {
399                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
400                 DPSOFTRAST_RecalcDepthFunc(thread);
401         }
402         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
403         {
404                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
405                 DPSOFTRAST_RecalcBlendFunc(thread);
406         }
407 }
408
409 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
410 {
411         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
412                 return &dpsoftrast.texture[index];
413         return NULL;
414 }
415
416 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
417 {
418         int w;
419         int h;
420         int d;
421         int size;
422         int s;
423         int texnum;
424         int mipmaps;
425         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
426         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
427         DPSOFTRAST_Texture *texture;
428         if (width*height*depth < 1)
429         {
430                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
431                 return 0;
432         }
433         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
434         {
435                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
436                 return 0;
437         }
438         switch(texformat)
439         {
440         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
441         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
442         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
443                 break;
444         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
445                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
446                 {
447                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
448                         return 0;
449                 }
450                 if (depth != 1)
451                 {
452                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
453                         return 0;
454                 }
455                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
456                 {
457                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
458                         return 0;
459                 }
460                 break;
461         }
462         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
463         {
464                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
465                 return 0;
466         }
467         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
468         {
469                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
470                 return 0;
471         }
472         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
473         {
474                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
475                 return 0;
476         }
477         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
478         {
479                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
480                 return 0;
481         }
482         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
483         {
484                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
485                 return 0;
486         }
487         DPSOFTRAST_Flush();
488         // find first empty slot in texture array
489         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
490                 if (!dpsoftrast.texture[texnum].bytes)
491                         break;
492         dpsoftrast.texture_firstfree = texnum + 1;
493         if (dpsoftrast.texture_max <= texnum)
494         {
495                 // expand texture array as needed
496                 if (dpsoftrast.texture_max < 1024)
497                         dpsoftrast.texture_max = 1024;
498                 else
499                         dpsoftrast.texture_max *= 2;
500                 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
501         }
502         if (dpsoftrast.texture_end <= texnum)
503                 dpsoftrast.texture_end = texnum + 1;
504         texture = &dpsoftrast.texture[texnum];
505         memset(texture, 0, sizeof(*texture));
506         texture->flags = flags;
507         texture->width = width;
508         texture->height = height;
509         texture->depth = depth;
510         texture->sides = sides;
511         w = width;
512         h = height;
513         d = depth;
514         size = 0;
515         mipmaps = 0;
516         w = width;
517         h = height;
518         d = depth;
519         for (;;)
520         {
521                 s = w * h * d * sides * 4;
522                 texture->mipmap[mipmaps][0] = size;
523                 texture->mipmap[mipmaps][1] = s;
524                 texture->mipmap[mipmaps][2] = w;
525                 texture->mipmap[mipmaps][3] = h;
526                 texture->mipmap[mipmaps][4] = d;
527                 size += s;
528                 mipmaps++;
529                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
530                         break;
531                 if (w > 1) w >>= 1;
532                 if (h > 1) h >>= 1;
533                 if (d > 1) d >>= 1;
534         }
535         texture->mipmaps = mipmaps;
536         texture->size = size;
537
538         // allocate the pixels now
539         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
540
541         return texnum;
542 }
543 void DPSOFTRAST_Texture_Free(int index)
544 {
545         DPSOFTRAST_Texture *texture;
546         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
547         DPSOFTRAST_Flush();
548         if (texture->bytes)
549                 MM_FREE(texture->bytes);
550         texture->bytes = NULL;
551         memset(texture, 0, sizeof(*texture));
552         // adjust the free range and used range
553         if (dpsoftrast.texture_firstfree > index)
554                 dpsoftrast.texture_firstfree = index;
555         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
556                 dpsoftrast.texture_end--;
557 }
558 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
559 {
560         int i, x, y, z, w, layer0, layer1, row0, row1;
561         unsigned char *o, *i0, *i1, *i2, *i3;
562         DPSOFTRAST_Texture *texture;
563         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
564         if (texture->mipmaps <= 1)
565                 return;
566         for (i = 1;i < texture->mipmaps;i++)
567         {
568                 for (z = 0;z < texture->mipmap[i][4];z++)
569                 {
570                         layer0 = z*2;
571                         layer1 = z*2+1;
572                         if (layer1 >= texture->mipmap[i-1][4])
573                                 layer1 = texture->mipmap[i-1][4]-1;
574                         for (y = 0;y < texture->mipmap[i][3];y++)
575                         {
576                                 row0 = y*2;
577                                 row1 = y*2+1;
578                                 if (row1 >= texture->mipmap[i-1][3])
579                                         row1 = texture->mipmap[i-1][3]-1;
580                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
581                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
582                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
583                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
584                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
585                                 w = texture->mipmap[i][2];
586                                 if (layer1 > layer0)
587                                 {
588                                         if (texture->mipmap[i-1][2] > 1)
589                                         {
590                                                 // average 3D texture
591                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
592                                                 {
593                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
594                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
595                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
596                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
597                                                 }
598                                         }
599                                         else
600                                         {
601                                                 // average 3D mipmap with parent width == 1
602                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
603                                                 {
604                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
605                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
606                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
607                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
608                                                 }
609                                         }
610                                 }
611                                 else
612                                 {
613                                         if (texture->mipmap[i-1][2] > 1)
614                                         {
615                                                 // average 2D texture (common case)
616                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
617                                                 {
618                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
619                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
620                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
621                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
622                                                 }
623                                         }
624                                         else
625                                         {
626                                                 // 2D texture with parent width == 1
627                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
628                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
629                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
630                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
631                                         }
632                                 }
633                         }
634                 }
635         }
636 }
637 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
638 {
639         DPSOFTRAST_Texture *texture;
640         unsigned char *dst;
641         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
642         DPSOFTRAST_Flush();
643         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
644         while (blockheight > 0)
645         {
646                 memcpy(dst, pixels, blockwidth * 4);
647                 pixels += blockwidth * 4;
648                 dst += texture->mipmap[0][2] * 4;
649                 blockheight--;
650         }
651         DPSOFTRAST_Texture_CalculateMipmaps(index);
652 }
653 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
654 {
655         DPSOFTRAST_Texture *texture;
656         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
657         DPSOFTRAST_Flush();
658         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
659         DPSOFTRAST_Texture_CalculateMipmaps(index);
660 }
661 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
662 {
663         DPSOFTRAST_Texture *texture;
664         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
665         return texture->mipmap[mip][2];
666 }
667 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
668 {
669         DPSOFTRAST_Texture *texture;
670         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
671         return texture->mipmap[mip][3];
672 }
673 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
674 {
675         DPSOFTRAST_Texture *texture;
676         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
677         return texture->mipmap[mip][4];
678 }
679 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
680 {
681         DPSOFTRAST_Texture *texture;
682         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
683         DPSOFTRAST_Flush();
684         return texture->bytes + texture->mipmap[mip][0];
685 }
686 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
687 {
688         DPSOFTRAST_Texture *texture;
689         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
690         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
691         {
692                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
693                 return;
694         }
695         DPSOFTRAST_Flush();
696         texture->filter = filter;
697 }
698
699 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
700 {
701         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
702                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
703                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
704                 DPSOFTRAST_Flush();
705         dpsoftrast.fb_width = width;
706         dpsoftrast.fb_height = height;
707         dpsoftrast.fb_depthpixels = depthpixels;
708         dpsoftrast.fb_colorpixels[0] = colorpixels0;
709         dpsoftrast.fb_colorpixels[1] = colorpixels1;
710         dpsoftrast.fb_colorpixels[2] = colorpixels2;
711         dpsoftrast.fb_colorpixels[3] = colorpixels3;
712 }
713
714 void DPSOFTRAST_Draw_FlushThreads(void);
715
716 void DPSOFTRAST_Draw_FreeTrianglePool(int space)
717 {
718         DPSOFTRAST_State_Thread *thread;
719         int i;
720         int freetriangle = dpsoftrast.trianglepool.freetriangle;
721         int usedtriangles = dpsoftrast.trianglepool.usedtriangles;
722         if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space)
723             return;
724 #ifdef USE_THREADS
725         SDL_LockMutex(dpsoftrast.trianglemutex);
726 #endif
727         for(;;)
728         {
729             int waitindex = -1;
730             int triangleoffset;
731             usedtriangles = 0;
732             for (i = 0; i < dpsoftrast.numthreads; i++)
733             {
734                 thread = &dpsoftrast.threads[i];
735                 triangleoffset = freetriangle - thread->triangleoffset;
736                 if (triangleoffset < 0)
737                     triangleoffset += DPSOFTRAST_DRAW_MAXTRIANGLEPOOL;
738                 if (triangleoffset > usedtriangles)
739                 {
740                     waitindex = i;
741                     usedtriangles = triangleoffset;
742                 }
743             }
744             if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space || waitindex < 0)
745                 break;
746 #ifdef USE_THREADS
747             thread = &dpsoftrast.threads[waitindex];
748             thread->waiting = true;
749             SDL_CondBroadcast(dpsoftrast.trianglecond);
750             SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
751             thread->waiting = false;
752 #endif
753         }
754 #ifdef USE_THREADS
755         SDL_UnlockMutex(dpsoftrast.trianglemutex);
756 #endif
757         dpsoftrast.trianglepool.usedtriangles = usedtriangles;
758 }
759
760 void DPSOFTRAST_Draw_SyncCommands(void)
761 {
762         DPSOFTRAST_State_Triangle *triangle;
763         if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
764 #ifdef USE_THREADS
765             DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
766 #else
767             DPSOFTRAST_Draw_FlushThreads();
768 #endif
769         triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
770         triangle->commandoffset = dpsoftrast.commandpool.freecommand;
771         triangle->starty = -1;
772         triangle->endy = -1;
773         dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
774         dpsoftrast.trianglepool.usedtriangles++;
775         MEMORY_BARRIER;
776         dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
777 }
778
779 void DPSOFTRAST_Draw_FreeCommandPool(int space)
780 {
781         DPSOFTRAST_State_Thread *thread;
782         int i;
783         int freecommand = dpsoftrast.commandpool.freecommand;
784         int usedcommands = dpsoftrast.commandpool.usedcommands;
785         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
786                 return;
787         DPSOFTRAST_Draw_SyncCommands();
788 #ifdef USE_THREADS
789         SDL_LockMutex(dpsoftrast.trianglemutex);
790 #endif
791         for(;;)
792         {
793                 int waitindex = -1;
794                 int commandoffset;
795                 usedcommands = 0;
796                 for (i = 0; i < dpsoftrast.numthreads; i++)
797                 {
798                         thread = &dpsoftrast.threads[i]; 
799                         commandoffset = freecommand - thread->commandoffset;
800                         if (commandoffset < 0)
801                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
802                         if (commandoffset > usedcommands)
803                         {
804                                 waitindex = i;
805                                 usedcommands = commandoffset;
806                         }
807                 }
808                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
809                         break;
810 #ifdef USE_THREADS
811                 thread = &dpsoftrast.threads[waitindex];
812                 thread->waiting = true;
813                 SDL_CondBroadcast(dpsoftrast.trianglecond);
814                 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
815                 thread->waiting = false;
816 #endif
817         }
818 #ifdef USE_THREADS
819         SDL_UnlockMutex(dpsoftrast.trianglemutex);
820 #endif
821         dpsoftrast.commandpool.usedcommands = usedcommands;
822 }
823
// Allocates a DPSOFTRAST_Command_<name> record through DPSOFTRAST_AllocateCommand.
// The requested byte count is sizeof(record) rounded up to the next multiple of
// COMMAND_SIZE (the mask arithmetic assumes COMMAND_SIZE is a power of two).
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
        ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand((sizeof( DPSOFTRAST_Command_##name ) + (COMMAND_SIZE-1)) & ~(size_t)(COMMAND_SIZE-1)))
826
827 static void *DPSOFTRAST_AllocateCommand(int size)
828 {
829         DPSOFTRAST_Command *command;
830         int freecommand = dpsoftrast.commandpool.freecommand;
831         int usedcommands = dpsoftrast.commandpool.usedcommands;
832         int extra = sizeof(DPSOFTRAST_Command);
833         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
834                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
835         if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
836         {
837 #ifdef USE_THREADS
838                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
839 #else
840                 DPSOFTRAST_Draw_FlushThreads();
841 #endif
842                 freecommand = dpsoftrast.commandpool.freecommand;
843                 usedcommands = dpsoftrast.commandpool.usedcommands;
844         }
845         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
846         {
847                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
848                 command->opcode = DPSOFTRAST_OPCODE_Reset;
849                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
850                 freecommand = 0;
851         }
852         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
853         freecommand += size;
854         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
855                 freecommand = 0;
856
857         dpsoftrast.commandpool.freecommand = freecommand;
858         dpsoftrast.commandpool.usedcommands = usedcommands + size;
859         return command;
860 }
861         
862 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
863 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
864 {
865         thread->viewport[0] = command->x;
866         thread->viewport[1] = command->y;
867         thread->viewport[2] = command->width;
868         thread->viewport[3] = command->height;
869         thread->validate |= DPSOFTRAST_VALIDATE_FB;
870 }
871 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
872 {
873         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
874         command->opcode = DPSOFTRAST_OPCODE_Viewport;
875         command->x = x;
876         command->y = y;
877         command->width = width;
878         command->height = height;
879
880         dpsoftrast.viewport[0] = x;
881         dpsoftrast.viewport[1] = y;
882         dpsoftrast.viewport[2] = width;
883         dpsoftrast.viewport[3] = height;
884         dpsoftrast.fb_viewportcenter[1] = dpsoftrast.viewport[0] + 0.5f * dpsoftrast.viewport[2] - 0.5f;
885         dpsoftrast.fb_viewportcenter[2] = dpsoftrast.fb_height - dpsoftrast.viewport[1] - 0.5f * dpsoftrast.viewport[3] - 0.5f;
886         dpsoftrast.fb_viewportcenter[3] = 0.5f;
887         dpsoftrast.fb_viewportcenter[0] = 0.0f;
888         dpsoftrast.fb_viewportscale[1] = 0.5f * dpsoftrast.viewport[2];
889         dpsoftrast.fb_viewportscale[2] = -0.5f * dpsoftrast.viewport[3];
890         dpsoftrast.fb_viewportscale[3] = 0.5f;
891         dpsoftrast.fb_viewportscale[0] = 1.0f;
892 }
893
894 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
895 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
896 {
897         int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
898         unsigned int *p;
899         unsigned int c;
900         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
901         x1 = thread->fb_clearscissor[0];
902         y1 = thread->fb_clearscissor[1];
903         x2 = thread->fb_clearscissor[2];
904         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
905         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
906         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
907         if(y1 < t1) y1 = t1;
908         if(y2 > t2) y2 = t2;
909         w = x2 - x1;
910         h = y2 - y1;
911         if (w < 1 || h < 1)
912                 return;
913         // FIXME: honor fb_colormask?
914         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
915         for (i = 0;i < 4;i++)
916         {
917                 if (!dpsoftrast.fb_colorpixels[i])
918                         continue;
919                 for (y = y1;y < y2;y++)
920                 {
921                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
922                         for (x = x1;x < x2;x++)
923                                 p[x] = c;
924                 }
925         }
926 }
927 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
928 {
929         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
930         command->opcode = DPSOFTRAST_OPCODE_ClearColor;
931         command->r = r;
932         command->g = g;
933         command->b = b;
934         command->a = a;
935 }
936
937 DEFCOMMAND(3, ClearDepth, float depth;)
938 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
939 {
940         int x1, y1, x2, y2, w, h, x, y, t1, t2;
941         unsigned int *p;
942         unsigned int c;
943         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
944         x1 = thread->fb_clearscissor[0];
945         y1 = thread->fb_clearscissor[1];
946         x2 = thread->fb_clearscissor[2];
947         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
948         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
949         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
950         if(y1 < t1) y1 = t1;
951         if(y2 > t2) y2 = t2;
952         w = x2 - x1;
953         h = y2 - y1;
954         if (w < 1 || h < 1)
955                 return;
956         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
957         for (y = y1;y < y2;y++)
958         {
959                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
960                 for (x = x1;x < x2;x++)
961                         p[x] = c;
962         }
963 }
964 void DPSOFTRAST_ClearDepth(float d)
965 {
966         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
967         command->opcode = DPSOFTRAST_OPCODE_ClearDepth;
968         command->depth = d;
969 }
970
971 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
972 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
973 {
974         thread->colormask[0] = command->r != 0;
975         thread->colormask[1] = command->g != 0;
976         thread->colormask[2] = command->b != 0;
977         thread->colormask[3] = command->a != 0;
978         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
979 }
980 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
981 {
982         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
983         command->opcode = DPSOFTRAST_OPCODE_ColorMask;
984         command->r = r;
985         command->g = g;
986         command->b = b;
987         command->a = a;
988 }
989
990 DEFCOMMAND(5, DepthTest, int enable;)
991 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
992 {
993         thread->depthtest = command->enable;
994         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
995 }
996 void DPSOFTRAST_DepthTest(int enable)
997 {
998         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
999         command->opcode = DPSOFTRAST_OPCODE_DepthTest;
1000         command->enable = enable;
1001 }
1002
1003 DEFCOMMAND(6, ScissorTest, int enable;)
1004 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1005 {
1006         thread->scissortest = command->enable;
1007         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1008 }
1009 void DPSOFTRAST_ScissorTest(int enable)
1010 {
1011         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1012         command->opcode = DPSOFTRAST_OPCODE_ScissorTest;
1013         command->enable = enable;
1014 }
1015
1016 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1017 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1018 {
1019         thread->scissor[0] = command->x;
1020         thread->scissor[1] = command->y;
1021         thread->scissor[2] = command->width;
1022         thread->scissor[3] = command->height;
1023         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1024 }
1025 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1026 {
1027         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1028         command->opcode = DPSOFTRAST_OPCODE_Scissor;
1029         command->x = x;
1030         command->y = y;
1031         command->width = width;
1032         command->height = height;
1033 }
1034
1035 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1036 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1037 {
1038         thread->blendfunc[0] = command->sfactor;
1039         thread->blendfunc[1] = command->dfactor;
1040         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1041 }
1042 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1043 {
1044         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1045         command->opcode = DPSOFTRAST_OPCODE_BlendFunc;
1046         command->sfactor = sfactor;
1047         command->dfactor = dfactor;
1048 }
1049
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1052 {
1053         thread->blendsubtract = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1055 }
1056 void DPSOFTRAST_BlendSubtract(int enable)
1057 {
1058         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059         command->opcode = DPSOFTRAST_OPCODE_BlendSubtract;
1060         command->enable = enable;
1061 }
1062
1063 DEFCOMMAND(10, DepthMask, int enable;)
1064 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1065 {
1066         thread->depthmask = command->enable;
1067 }
1068 void DPSOFTRAST_DepthMask(int enable)
1069 {
1070         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1071         command->opcode = DPSOFTRAST_OPCODE_DepthMask;
1072         command->enable = enable;
1073 }
1074
1075 DEFCOMMAND(11, DepthFunc, int func;)
1076 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1077 {
1078         thread->depthfunc = command->func;
1079 }
1080 void DPSOFTRAST_DepthFunc(int func)
1081 {
1082         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1083         command->opcode = DPSOFTRAST_OPCODE_DepthFunc;
1084         command->func = func;
1085 }
1086
1087 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1088 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1089 {
1090         thread->depthrange[0] = command->nearval;
1091         thread->depthrange[1] = command->farval;
1092 }
1093 void DPSOFTRAST_DepthRange(float nearval, float farval)
1094 {
1095         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1096         command->opcode = DPSOFTRAST_OPCODE_DepthRange;
1097         command->nearval = nearval;
1098         command->farval = farval;
1099 }
1100
1101 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1102 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1103 {
1104         thread->polygonoffset[0] = command->alongnormal;
1105         thread->polygonoffset[1] = command->intoview;
1106 }
1107 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1108 {
1109         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1110         command->opcode = DPSOFTRAST_OPCODE_PolygonOffset;
1111         command->alongnormal = alongnormal;
1112         command->intoview = intoview;
1113 }
1114
1115 void DPSOFTRAST_CullFace(int mode)
1116 {
1117         dpsoftrast.cullface = mode;
1118 }
1119
1120 DEFCOMMAND(15, AlphaTest, int enable;)
1121 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1122 {
1123         thread->alphatest = command->enable;
1124 }
1125 void DPSOFTRAST_AlphaTest(int enable)
1126 {
1127         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1128         command->opcode = DPSOFTRAST_OPCODE_AlphaTest;
1129         command->enable = enable;
1130 }
1131
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1134 {
1135         thread->alphafunc = command->func;
1136         thread->alphavalue = command->ref;
1137 }
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1139 {
1140         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141         command->opcode = DPSOFTRAST_OPCODE_AlphaFunc;
1142         command->func = func;
1143         command->ref = ref;
1144 }
1145
1146 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1147 {
1148         dpsoftrast.color[0] = r;
1149         dpsoftrast.color[1] = g;
1150         dpsoftrast.color[2] = b;
1151         dpsoftrast.color[3] = a;
1152 }
1153
1154 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1155 {
1156         int outstride = blockwidth * 4;
1157         int instride = dpsoftrast.fb_width * 4;
1158         int bx1 = blockx;
1159         int by1 = blocky;
1160         int bx2 = blockx + blockwidth;
1161         int by2 = blocky + blockheight;
1162         int bw;
1163         int bh;
1164         int x;
1165         int y;
1166         unsigned char *inpixels;
1167         unsigned char *b;
1168         unsigned char *o;
1169         DPSOFTRAST_Flush();
1170         if (bx1 < 0) bx1 = 0;
1171         if (by1 < 0) by1 = 0;
1172         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1173         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1174         bw = bx2 - bx1;
1175         bh = by2 - by1;
1176         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1177         if (dpsoftrast.bigendian)
1178         {
1179                 for (y = by1;y < by2;y++)
1180                 {
1181                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1182                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1183                         for (x = bx1;x < bx2;x++)
1184                         {
1185                                 o[0] = b[3];
1186                                 o[1] = b[2];
1187                                 o[2] = b[1];
1188                                 o[3] = b[0];
1189                                 o += 4;
1190                                 b += 4;
1191                         }
1192                 }
1193         }
1194         else
1195         {
1196                 for (y = by1;y < by2;y++)
1197                 {
1198                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1199                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1200                         memcpy(o, b, bw*4);
1201                 }
1202         }
1203
1204 }
1205 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 {
1207         int tx1 = tx;
1208         int ty1 = ty;
1209         int tx2 = tx + width;
1210         int ty2 = ty + height;
1211         int sx1 = sx;
1212         int sy1 = sy;
1213         int sx2 = sx + width;
1214         int sy2 = sy + height;
1215         int swidth;
1216         int sheight;
1217         int twidth;
1218         int theight;
1219         int sw;
1220         int sh;
1221         int tw;
1222         int th;
1223         int y;
1224         unsigned int *spixels;
1225         unsigned int *tpixels;
1226         DPSOFTRAST_Texture *texture;
1227         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1228         if (mip < 0 || mip >= texture->mipmaps) return;
1229         DPSOFTRAST_Flush();
1230         spixels = dpsoftrast.fb_colorpixels[0];
1231         swidth = dpsoftrast.fb_width;
1232         sheight = dpsoftrast.fb_height;
1233         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1234         twidth = texture->mipmap[mip][2];
1235         theight = texture->mipmap[mip][3];
1236         if (tx1 < 0) tx1 = 0;
1237         if (ty1 < 0) ty1 = 0;
1238         if (tx2 > twidth) tx2 = twidth;
1239         if (ty2 > theight) ty2 = theight;
1240         if (sx1 < 0) sx1 = 0;
1241         if (sy1 < 0) sy1 = 0;
1242         if (sx2 > swidth) sx2 = swidth;
1243         if (sy2 > sheight) sy2 = sheight;
1244         tw = tx2 - tx1;
1245         th = ty2 - ty1;
1246         sw = sx2 - sx1;
1247         sh = sy2 - sy1;
1248         if (tw > sw) tw = sw;
1249         if (th > sh) th = sh;
1250         if (tw < 1 || th < 1)
1251                 return;
1252         for (y = 0;y < th;y++)
1253                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1254         if (texture->mipmaps > 1)
1255                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1256 }
1257
1258 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1259 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1260 {
1261         thread->texbound[command->unitnum] = command->texture;
1262 }
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1264 {
1265         DPSOFTRAST_Command_SetTexture *command;
1266         DPSOFTRAST_Texture *texture;
1267         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1268         {
1269                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1270                 return;
1271         }
1272         texture = DPSOFTRAST_Texture_GetByIndex(index);
1273         if (index && !texture)
1274         {
1275                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1276                 return;
1277         }
1278
1279         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280         command->opcode = DPSOFTRAST_OPCODE_SetTexture;
1281         command->unitnum = unitnum;
1282         command->texture = texture;
1283
1284         dpsoftrast.texbound[unitnum] = texture;
1285 }
1286
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1288 {
1289         dpsoftrast.pointer_vertex3f = vertex3f;
1290         dpsoftrast.stride_vertex = stride;
1291 }
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1293 {
1294         dpsoftrast.pointer_color4f = color4f;
1295         dpsoftrast.pointer_color4ub = NULL;
1296         dpsoftrast.stride_color = stride;
1297 }
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1299 {
1300         dpsoftrast.pointer_color4f = NULL;
1301         dpsoftrast.pointer_color4ub = color4ub;
1302         dpsoftrast.stride_color = stride;
1303 }
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1305 {
1306         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308         dpsoftrast.stride_texcoord[unitnum] = stride;
1309 }
1310
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1313 {
1314         thread->shader_mode = command->mode;
1315         thread->shader_permutation = command->permutation;
1316 }
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1318 {
1319         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320         command->opcode = DPSOFTRAST_OPCODE_SetShader;
1321         command->mode = mode;
1322         command->permutation = permutation;
1323
1324         dpsoftrast.shader_mode = mode;
1325         dpsoftrast.shader_permutation = permutation;
1326 }
1327
1328 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1329 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1330 {
1331         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1332 }
1333 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1334 {
1335         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1336         command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1337         command->index = index;
1338         command->val[0] = v0;
1339         command->val[1] = v1;
1340         command->val[2] = v2;
1341         command->val[3] = v3;
1342
1343         dpsoftrast.uniform4f[index*4+0] = v0;
1344         dpsoftrast.uniform4f[index*4+1] = v1;
1345         dpsoftrast.uniform4f[index*4+2] = v2;
1346         dpsoftrast.uniform4f[index*4+3] = v3;
1347 }
1348 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1349 {
1350         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1351         command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1352         command->index = index;
1353         memcpy(command->val, v, sizeof(command->val));
1354
1355         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1356 }
1357
1358 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1359 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1360 {
1361         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1362 }
1363 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1364 {
1365 #ifdef SSE2_PRESENT
1366         int i, index;
1367         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1368         {
1369                 __m128 m0, m1, m2, m3;
1370                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1371                 command->opcode = DPSOFTRAST_OPCODE_UniformMatrix4f;
1372                 command->index = index;
1373                 if (((size_t)v)&(ALIGN_SIZE-1))
1374                 {
1375                         m0 = _mm_loadu_ps(v);
1376                         m1 = _mm_loadu_ps(v+4);
1377                         m2 = _mm_loadu_ps(v+8);
1378                         m3 = _mm_loadu_ps(v+12);
1379                 }
1380                 else
1381                 {
1382                         m0 = _mm_load_ps(v);
1383                         m1 = _mm_load_ps(v+4);
1384                         m2 = _mm_load_ps(v+8);
1385                         m3 = _mm_load_ps(v+12);
1386                 }
1387                 if (transpose)
1388                 {
1389                         __m128 t0, t1, t2, t3;
1390                         t0 = _mm_unpacklo_ps(m0, m1);
1391                         t1 = _mm_unpacklo_ps(m2, m3);
1392                         t2 = _mm_unpackhi_ps(m0, m1);
1393                         t3 = _mm_unpackhi_ps(m2, m3);
1394                         m0 = _mm_movelh_ps(t0, t1);
1395                         m1 = _mm_movehl_ps(t1, t0);
1396                         m2 = _mm_movelh_ps(t2, t3);
1397                         m3 = _mm_movehl_ps(t3, t2);                     
1398                 }
1399                 _mm_store_ps(command->val, m0);
1400                 _mm_store_ps(command->val+4, m1);
1401                 _mm_store_ps(command->val+8, m2);
1402                 _mm_store_ps(command->val+12, m3);
1403                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1404                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1405                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1406                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1407         }
1408 #endif
1409 }
1410
1411 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1412 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1413 {
1414         thread->uniform1i[command->index] = command->val;
1415 }
1416 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1417 {
1418         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1419         command->opcode = DPSOFTRAST_OPCODE_Uniform1i;
1420         command->index = index;
1421         command->val = i0;
1422
1423         dpsoftrast.uniform1i[command->index] = i0;
1424 }
1425
1426 #ifdef SSE2_PRESENT
1427 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1428 {
1429         float *end = dst + size*4;
1430         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1431         {
1432                 while (dst < end)
1433                 {
1434                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1435                         dst += 4;
1436                         src += stride;
1437                 }
1438         }
1439         else
1440         {
1441                 while (dst < end)
1442                 {
1443                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1444                         dst += 4;
1445                         src += stride;
1446                 }
1447         }
1448 }
1449
1450 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1451 {
1452         float *end = dst + size*4;
1453         if (stride == sizeof(float[3]))
1454         {
1455                 float *end4 = dst + (size&~3)*4;        
1456                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1457                 {
1458                         while (dst < end4)
1459                         {
1460                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1461                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1462                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1463                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1464                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1465                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1466                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1467                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1468                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1469                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1470                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1471                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1472                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473                                 dst += 16;
1474                                 src += 4*sizeof(float[3]);
1475                         }
1476                 }
1477                 else
1478                 {
1479                         while (dst < end4)
1480                         {
1481                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1482                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1483                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1484                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1486                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1487                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1488                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1489                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1490                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1491                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1492                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1493                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1494                                 dst += 16;
1495                                 src += 4*sizeof(float[3]);
1496                         }
1497                 }
1498         }
1499         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1500         {
1501                 while (dst < end)
1502                 {
1503                         __m128 v = _mm_loadu_ps((const float *)src);
1504                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1505                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1506                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1507                         _mm_store_ps(dst, v);
1508                         dst += 4;
1509                         src += stride;
1510                 }
1511         }
1512         else
1513         {
1514                 while (dst < end)
1515                 {
1516                         __m128 v = _mm_load_ps((const float *)src);
1517                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1518                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1519                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1520                         _mm_store_ps(dst, v);
1521                         dst += 4;
1522                         src += stride;
1523                 }
1524         }
1525 }
1526
1527 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1528 {
1529         float *end = dst + size*4;
1530         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1531         if (stride == sizeof(float[2]))
1532         {
1533                 float *end2 = dst + (size&~1)*4;
1534                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1535                 {
1536                         while (dst < end2)
1537                         {
1538                                 __m128 v = _mm_loadu_ps((const float *)src);
1539                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1540                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1541                                 dst += 8;
1542                                 src += 2*sizeof(float[2]);
1543                         }
1544                 }
1545                 else
1546                 {
1547                         while (dst < end2)
1548                         {
1549                                 __m128 v = _mm_load_ps((const float *)src);
1550                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1551                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1552                                 dst += 8;
1553                                 src += 2*sizeof(float[2]);
1554                         }
1555                 }
1556         }
1557         while (dst < end)
1558         {
1559                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1560                 dst += 4;
1561                 src += stride;
1562         }
1563 }
1564
1565 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1566 {
1567         float *end = dst + size*4;
1568         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1569         if (stride == sizeof(unsigned char[4]))
1570         {
1571                 float *end4 = dst + (size&~3)*4;
1572                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1573                 {
1574                         while (dst < end4)
1575                         {
1576                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1577                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1578                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1579                     _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1580                     _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1581                                 dst += 16;
1582                                 src += 4*sizeof(unsigned char[4]);
1583                         }
1584                 }
1585                 else
1586                 {
1587                 while (dst < end4)
1588                 {
1589                     __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1590                     _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1591                     _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1592                     _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1593                     _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1594                     dst += 16;
1595                     src += 4*sizeof(unsigned char[4]);
1596                 }
1597                 }
1598         }
1599         while (dst < end)
1600         {
1601                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1602                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1603                 dst += 4;
1604                 src += stride;
1605         }
1606 }
1607
// Writes the 4-float value found at src into 'size' consecutive 4-float slots
// of dst.  src may be unaligned; dst is written with aligned 16-byte stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float * const stop = dst + 4*size;
	for (;dst < stop;dst += 4)
		_mm_store_ps(dst, value);
}
1618 #endif
1619
1620 void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors)
1621 {
1622 #ifdef SSE2_PRESENT
1623         int i;
1624         int j;
1625         int stride;
1626         const float *v;
1627         float *p;
1628         float *data;
1629         const unsigned char *b;
1630         dpsoftrast.numvertices = numvertices;
1631         if (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1632         {
1633                 if (dpsoftrast.maxvertices < 4096)
1634                         dpsoftrast.maxvertices = 4096;
1635                 while (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1636                         dpsoftrast.maxvertices *= 2;
1637                 if (dpsoftrast.in_array4f[0])
1638                         MM_FREE(dpsoftrast.in_array4f[0]);
1639                 data = (float *)MM_CALLOC(1, dpsoftrast.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
1640                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1641                         dpsoftrast.in_array4f[i] = data;
1642                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1643                         dpsoftrast.post_array4f[i] = data;
1644                 dpsoftrast.screencoord4f = data;
1645                 data += dpsoftrast.maxvertices * 4;
1646         }
1647         stride = dpsoftrast.stride_vertex;
1648         v = (const float *)((unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride);
1649         p = dpsoftrast.in_array4f[0];
1650         DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1651         if (needcolors)
1652         {
1653                 if (dpsoftrast.pointer_color4f)
1654                 {
1655                         stride = dpsoftrast.stride_color;
1656                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride);
1657                         p = dpsoftrast.in_array4f[1];
1658                         DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1659                 }
1660                 else if (dpsoftrast.pointer_color4ub)
1661                 {
1662                         stride = dpsoftrast.stride_color;
1663                         b = (const unsigned char *)((const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride);
1664                         p = dpsoftrast.in_array4f[1];
1665                         DPSOFTRAST_Load4bTo4f(p, b, numvertices, stride);
1666                 }
1667                 else
1668                 {
1669                         p = dpsoftrast.in_array4f[1];
1670                         DPSOFTRAST_Fill4f(p, dpsoftrast.color, numvertices);
1671                 }
1672         }
1673         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL-2;j++)
1674         {
1675                 if (dpsoftrast.pointer_texcoordf[j])
1676                 {
1677                         stride = dpsoftrast.stride_texcoord[j];
1678                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_texcoordf[j] + firstvertex * stride);
1679                         p = dpsoftrast.in_array4f[j+2];
1680                         switch(dpsoftrast.components_texcoord[j])
1681                         {
1682                         case 2:
1683                                 DPSOFTRAST_Load2fTo4f(p, (const unsigned char *)v, numvertices, stride);
1684                                 break;
1685                         case 3:
1686                                 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1687                                 break;
1688                         case 4:
1689                                 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1690                                 break;
1691                         }
1692                 }
1693         }
1694 #endif
1695 }
1696
// Transforms 'numitems' 4-float vectors by a 4x4 matrix:
//   out = in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15]
// An identity matrix is detected bytewise and handled with a plain copy.
// out4f is written with aligned 16-byte stores; in4f and inmatrix16f may be
// unaligned.  Does nothing unless SSE2_PRESENT is defined.
void DPSOFTRAST_Array_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 col0, col1, col2, col3;
	int unaligned;
	int n;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	col0 = _mm_loadu_ps(inmatrix16f);
	col1 = _mm_loadu_ps(inmatrix16f + 4);
	col2 = _mm_loadu_ps(inmatrix16f + 8);
	col3 = _mm_loadu_ps(inmatrix16f + 12);
	// in4f moves by 16 bytes per item, so one alignment check covers the whole loop
	unaligned = (((size_t)in4f)&(ALIGN_SIZE-1)) != 0;
	for (n = 0;n < numitems;n++, in4f += 4, out4f += 4)
	{
		const __m128 v = unaligned ? _mm_loadu_ps(in4f) : _mm_load_ps(in4f);
		// accumulate from the innermost term outwards: x + (y + (z + w))
		__m128 sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2),
		                        _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3));
		sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1), sum);
		sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0), sum);
		_mm_store_ps(out4f, sum);
	}
#endif
}
1743
// Copies 'numitems' 4-float vectors from in4f to out4f (the ranges must not overlap).
void DPSOFTRAST_Array_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1748
1749 #ifdef SSE2_PRESENT
1750 static __m128 DPSOFTRAST_Draw_ProjectVertex(__m128 v)
1751 {
1752         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1753         __m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1754         v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1755         v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1756         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1757         return v;
1758 }
1759 #endif
1760
// Copies 'numitems' 4-float vertices from in4f to out4f unchanged and writes
// their viewport projection to screen4f.
// For a vertex (x, y, z, w) the rotated vector (1, x, y, z) is multiplied by
// fb_viewportscale, divided by w and offset by fb_viewportcenter, then
// rotated back so that x, y, z return to lanes 0..2.
// out4f and screen4f are written with aligned 16-byte stores.  in4f may be
// unaligned: DPSOFTRAST_Array_TransformProject accepts unaligned input and
// forwards it here when its matrix is the identity.
// Does nothing unless SSE2_PRESENT is defined.
void DPSOFTRAST_Array_Project(float *out4f, float *screen4f, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	// in4f advances by 16 bytes per item, so one alignment check covers the whole loop
	const int unaligned = (((size_t)in4f)&(ALIGN_SIZE-1)) != 0;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	while (out4f < end)
	{
		__m128 v = unaligned ? _mm_loadu_ps(in4f) : _mm_load_ps(in4f);
		__m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
		_mm_store_ps(out4f, v);
		// (x, y, z, w) -> (1, x, y, z)
		v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
		v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
		// rotate back so lane 0 moves to lane 3
		_mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
		in4f += 4;
		out4f += 4;
		screen4f += 4;
	}
#endif
}
1779
// Transforms 'numitems' 4-float vertices by a 4x4 matrix, stores the
// transformed vertices in out4f and their viewport projection in screen4f.
// An identity matrix is detected bytewise and delegated to
// DPSOFTRAST_Array_Project.  out4f and screen4f are written with aligned
// 16-byte stores; in4f and inmatrix16f may be unaligned.
// Does nothing unless SSE2_PRESENT is defined.
void DPSOFTRAST_Array_TransformProject(float *out4f, float *screen4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 col0, col1, col2, col3, center, scale;
	int unaligned;
	int n;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		DPSOFTRAST_Array_Project(out4f, screen4f, in4f, numitems);
		return;
	}
	center = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	scale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	col0 = _mm_loadu_ps(inmatrix16f);
	col1 = _mm_loadu_ps(inmatrix16f + 4);
	col2 = _mm_loadu_ps(inmatrix16f + 8);
	col3 = _mm_loadu_ps(inmatrix16f + 12);
	// in4f moves by 16 bytes per item, so one alignment check covers the whole loop
	unaligned = (((size_t)in4f)&(ALIGN_SIZE-1)) != 0;
	for (n = 0;n < numitems;n++, in4f += 4, out4f += 4, screen4f += 4)
	{
		const __m128 v = unaligned ? _mm_loadu_ps(in4f) : _mm_load_ps(in4f);
		__m128 wwww, proj;
		// transform, accumulating from the innermost term outwards: x + (y + (z + w))
		__m128 sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2),
		                        _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3));
		sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1), sum);
		sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0), sum);
		_mm_store_ps(out4f, sum);
		// project: (x, y, z, w) -> (1, x, y, z), scale, divide by w, offset, rotate back
		wwww = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3));
		proj = _mm_move_ss(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
		proj = _mm_add_ps(center, _mm_div_ps(_mm_mul_ps(scale, proj), wwww));
		_mm_store_ps(screen4f, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(0, 3, 2, 1)));
	}
#endif
}
1837
1838 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1839 {
1840         int x;
1841         int startx = span->startx;
1842         int endx = span->endx;
1843         float wslope = triangle->w[0];
1844         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1845         float endz = 1.0f / (w + wslope * startx);
1846         for (x = startx;x < endx;)
1847         {
1848                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1849                 float z = endz, dz;
1850                 if(nextsub >= endx) nextsub = endsub = endx-1;
1851                 endz = 1.0f / (w + wslope * nextsub);
1852                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1853                 for (; x <= endsub; x++, z += dz)
1854                         zf[x] = z;
1855         }
1856 }
1857
1858 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1859 {
1860         int x;
1861         int startx = span->startx;
1862         int endx = span->endx;
1863         int d[4];
1864         float a, b;
1865         unsigned char * RESTRICT pixelmask = span->pixelmask;
1866         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1867         if (!pixel)
1868                 return;
1869         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1870         // handle alphatest now (this affects depth writes too)
1871         if (thread->alphatest)
1872                 for (x = startx;x < endx;x++)
1873                         if (in4f[x*4+3] < 0.5f)
1874                                 pixelmask[x] = false;
1875         // FIXME: this does not handle bigendian
1876         switch(thread->fb_blendmode)
1877         {
1878         case DPSOFTRAST_BLENDMODE_OPAQUE:
1879                 for (x = startx;x < endx;x++)
1880                 {
1881                         if (!pixelmask[x])
1882                                 continue;
1883                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1884                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1885                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1886                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1887                         pixel[x*4+0] = d[0];
1888                         pixel[x*4+1] = d[1];
1889                         pixel[x*4+2] = d[2];
1890                         pixel[x*4+3] = d[3];
1891                 }
1892                 break;
1893         case DPSOFTRAST_BLENDMODE_ALPHA:
1894                 for (x = startx;x < endx;x++)
1895                 {
1896                         if (!pixelmask[x])
1897                                 continue;
1898                         a = in4f[x*4+3] * 255.0f;
1899                         b = 1.0f - in4f[x*4+3];
1900                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1901                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1902                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1903                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1904                         pixel[x*4+0] = d[0];
1905                         pixel[x*4+1] = d[1];
1906                         pixel[x*4+2] = d[2];
1907                         pixel[x*4+3] = d[3];
1908                 }
1909                 break;
1910         case DPSOFTRAST_BLENDMODE_ADDALPHA:
1911                 for (x = startx;x < endx;x++)
1912                 {
1913                         if (!pixelmask[x])
1914                                 continue;
1915                         a = in4f[x*4+3] * 255.0f;
1916                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1917                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1918                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1919                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1920                         pixel[x*4+0] = d[0];
1921                         pixel[x*4+1] = d[1];
1922                         pixel[x*4+2] = d[2];
1923                         pixel[x*4+3] = d[3];
1924                 }
1925                 break;
1926         case DPSOFTRAST_BLENDMODE_ADD:
1927                 for (x = startx;x < endx;x++)
1928                 {
1929                         if (!pixelmask[x])
1930                                 continue;
1931                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1932                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1933                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1934                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1935                         pixel[x*4+0] = d[0];
1936                         pixel[x*4+1] = d[1];
1937                         pixel[x*4+2] = d[2];
1938                         pixel[x*4+3] = d[3];
1939                 }
1940                 break;
1941         case DPSOFTRAST_BLENDMODE_INVMOD:
1942                 for (x = startx;x < endx;x++)
1943                 {
1944                         if (!pixelmask[x])
1945                                 continue;
1946                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1947                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1948                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1949                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1950                         pixel[x*4+0] = d[0];
1951                         pixel[x*4+1] = d[1];
1952                         pixel[x*4+2] = d[2];
1953                         pixel[x*4+3] = d[3];
1954                 }
1955                 break;
1956         case DPSOFTRAST_BLENDMODE_MUL:
1957                 for (x = startx;x < endx;x++)
1958                 {
1959                         if (!pixelmask[x])
1960                                 continue;
1961                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1962                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1963                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1964                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1965                         pixel[x*4+0] = d[0];
1966                         pixel[x*4+1] = d[1];
1967                         pixel[x*4+2] = d[2];
1968                         pixel[x*4+3] = d[3];
1969                 }
1970                 break;
1971         case DPSOFTRAST_BLENDMODE_MUL2:
1972                 for (x = startx;x < endx;x++)
1973                 {
1974                         if (!pixelmask[x])
1975                                 continue;
1976                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
1977                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
1978                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
1979                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
1980                         pixel[x*4+0] = d[0];
1981                         pixel[x*4+1] = d[1];
1982                         pixel[x*4+2] = d[2];
1983                         pixel[x*4+3] = d[3];
1984                 }
1985                 break;
1986         case DPSOFTRAST_BLENDMODE_SUBALPHA:
1987                 for (x = startx;x < endx;x++)
1988                 {
1989                         if (!pixelmask[x])
1990                                 continue;
1991                         a = in4f[x*4+3] * -255.0f;
1992                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
1993                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
1994                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
1995                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
1996                         pixel[x*4+0] = d[0];
1997                         pixel[x*4+1] = d[1];
1998                         pixel[x*4+2] = d[2];
1999                         pixel[x*4+3] = d[3];
2000                 }
2001                 break;
2002         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2003                 for (x = startx;x < endx;x++)
2004                 {
2005                         if (!pixelmask[x])
2006                                 continue;
2007                         a = 255.0f;
2008                         b = 1.0f - in4f[x*4+3];
2009                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013                         pixel[x*4+0] = d[0];
2014                         pixel[x*4+1] = d[1];
2015                         pixel[x*4+2] = d[2];
2016                         pixel[x*4+3] = d[3];
2017                 }
2018                 break;
2019         }
2020 }
2021
2022 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2023 {
2024 #ifdef SSE2_PRESENT
2025         int x;
2026         int startx = span->startx;
2027         int endx = span->endx;
2028         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2029         unsigned char * RESTRICT pixelmask = span->pixelmask;
2030         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2031         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2032         if (!pixel)
2033                 return;
2034         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2035         pixeli += span->y * dpsoftrast.fb_width + span->x;
2036         // handle alphatest now (this affects depth writes too)
2037         if (thread->alphatest)
2038                 for (x = startx;x < endx;x++)
2039                         if (in4ub[x*4+3] < 0.5f)
2040                                 pixelmask[x] = false;
2041         // FIXME: this does not handle bigendian
2042         switch(thread->fb_blendmode)
2043         {
2044         case DPSOFTRAST_BLENDMODE_OPAQUE:
2045                 for (x = startx;x + 4 <= endx;)
2046                 {
2047                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2048                         {
2049                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2050                                 x += 4;
2051                         }
2052                         else
2053                         {
2054                                 if (pixelmask[x])
2055                                         pixeli[x] = ini[x];
2056                                 x++;
2057                         }
2058                 }
2059                 for (;x < endx;x++)
2060                         if (pixelmask[x])
2061                                 pixeli[x] = ini[x];
2062                 break;
2063         case DPSOFTRAST_BLENDMODE_ALPHA:
2064         #define FINISHBLEND(blend2, blend1) \
2065                 for (x = startx;x + 2 <= endx;x += 2) \
2066                 { \
2067                         __m128i src, dst; \
2068                         switch (*(const unsigned short*)&pixelmask[x]) \
2069                         { \
2070                         case 0x0101: \
2071                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2072                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2073                                 blend2; \
2074                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2075                                 continue; \
2076                         case 0x0100: \
2077                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2078                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2079                                 blend1; \
2080                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2081                                 continue; \
2082                         case 0x0001: \
2083                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2084                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2085                                 blend1; \
2086                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2087                                 continue; \
2088                         } \
2089                         break; \
2090                 } \
2091                 for(;x < endx; x++) \
2092                 { \
2093                         __m128i src, dst; \
2094                         if (!pixelmask[x]) \
2095                                 continue; \
2096                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2097                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2098                         blend1; \
2099                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2100                 }
2101
2102                 FINISHBLEND({
2103                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2104                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2105                 }, {
2106                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2107                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2108                 });
2109                 break;
2110         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2111                 FINISHBLEND({
2112                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2113                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2114                 }, {
2115                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2116                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2117                 });
2118                 break;
2119         case DPSOFTRAST_BLENDMODE_ADD:
2120                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2121                 break;
2122         case DPSOFTRAST_BLENDMODE_INVMOD:
2123                 FINISHBLEND({
2124                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2125                 }, {
2126                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2127                 });
2128                 break;
2129         case DPSOFTRAST_BLENDMODE_MUL:
2130                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2131                 break;
2132         case DPSOFTRAST_BLENDMODE_MUL2:
2133                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2134                 break;
2135         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2136                 FINISHBLEND({
2137                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2138                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2139                 }, {
2140                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2141                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2142                 });
2143                 break;
2144         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2145                 FINISHBLEND({
2146                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2147                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2148                 }, {
2149                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2150                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2151                 });
2152                 break;
2153         }
2154 #endif
2155 }
2156
2157 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2158 {
2159         int x;
2160         int startx = span->startx;
2161         int endx = span->endx;
2162         int flags;
2163         float c[4];
2164         float data[4];
2165         float slope[4];
2166         float tc[2], endtc[2];
2167         float tcscale[2];
2168         unsigned int tci[2];
2169         unsigned int tci1[2];
2170         unsigned int tcimin[2];
2171         unsigned int tcimax[2];
2172         int tciwrapmask[2];
2173         int tciwidth;
2174         int filter;
2175         int mip;
2176         const unsigned char * RESTRICT pixelbase;
2177         const unsigned char * RESTRICT pixel[4];
2178         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2179         // if no texture is bound, just fill it with white
2180         if (!texture)
2181         {
2182                 for (x = startx;x < endx;x++)
2183                 {
2184                         out4f[x*4+0] = 1.0f;
2185                         out4f[x*4+1] = 1.0f;
2186                         out4f[x*4+2] = 1.0f;
2187                         out4f[x*4+3] = 1.0f;
2188                 }
2189                 return;
2190         }
2191         mip = triangle->mip[texunitindex];
2192         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2193         // if this mipmap of the texture is 1 pixel, just fill it with that color
2194         if (texture->mipmap[mip][1] == 4)
2195         {
2196                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2197                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2198                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2199                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2200                 for (x = startx;x < endx;x++)
2201                 {
2202                         out4f[x*4+0] = c[0];
2203                         out4f[x*4+1] = c[1];
2204                         out4f[x*4+2] = c[2];
2205                         out4f[x*4+3] = c[3];
2206                 }
2207                 return;
2208         }
2209         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2210         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2211         flags = texture->flags;
2212         tcscale[0] = texture->mipmap[mip][2];
2213         tcscale[1] = texture->mipmap[mip][3];
2214         tciwidth = texture->mipmap[mip][2];
2215         tcimin[0] = 0;
2216         tcimin[1] = 0;
2217         tcimax[0] = texture->mipmap[mip][2]-1;
2218         tcimax[1] = texture->mipmap[mip][3]-1;
2219         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2220         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2221         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2222         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2223         for (x = startx;x < endx;)
2224         {
2225                 unsigned int subtc[2];
2226                 unsigned int substep[2];
2227                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2228                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2229                 if(nextsub >= endx)
2230                 {
2231                         nextsub = endsub = endx-1;      
2232                         if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2233                 }
2234                 tc[0] = endtc[0];
2235                 tc[1] = endtc[1];
2236                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2237                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2238                 substep[0] = (endtc[0] - tc[0]) * subscale;
2239                 substep[1] = (endtc[1] - tc[1]) * subscale;
2240                 subtc[0] = tc[0] * (1<<16);
2241                 subtc[1] = tc[1] * (1<<16);
2242                 if(filter)
2243                 {
2244                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2245                         {
2246                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2247                                 {
2248                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2249                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2250                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2251                                         tci[0] = subtc[0]>>16;
2252                                         tci[1] = subtc[1]>>16;
2253                                         tci1[0] = tci[0] + 1;
2254                                         tci1[1] = tci[1] + 1;
2255                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2256                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2257                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2258                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2259                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2260                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2261                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2262                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2263                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2264                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2265                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2266                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2267                                         out4f[x*4+0] = c[0];
2268                                         out4f[x*4+1] = c[1];
2269                                         out4f[x*4+2] = c[2];
2270                                         out4f[x*4+3] = c[3];
2271                                 }
2272                         }
2273                         else
2274                         {
2275                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2276                                 {
2277                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2278                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2279                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2280                                         tci[0] = subtc[0]>>16;
2281                                         tci[1] = subtc[1]>>16;
2282                                         tci1[0] = tci[0] + 1;
2283                                         tci1[1] = tci[1] + 1;
2284                                         tci[0] &= tciwrapmask[0];
2285                                         tci[1] &= tciwrapmask[1];
2286                                         tci1[0] &= tciwrapmask[0];
2287                                         tci1[1] &= tciwrapmask[1];
2288                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2289                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2290                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2291                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2292                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2293                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2294                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2295                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2296                                         out4f[x*4+0] = c[0];
2297                                         out4f[x*4+1] = c[1];
2298                                         out4f[x*4+2] = c[2];
2299                                         out4f[x*4+3] = c[3];
2300                                 }
2301                         }
2302                 }
2303                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2304                 {
2305                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2306                         {
2307                                 tci[0] = subtc[0]>>16;
2308                                 tci[1] = subtc[1]>>16;
2309                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2310                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2311                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2312                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2313                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2314                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2315                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2316                                 out4f[x*4+0] = c[0];
2317                                 out4f[x*4+1] = c[1];
2318                                 out4f[x*4+2] = c[2];
2319                                 out4f[x*4+3] = c[3];
2320                         }
2321                 }
2322                 else
2323                 {
2324                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2325                         {
2326                                 tci[0] = subtc[0]>>16;
2327                                 tci[1] = subtc[1]>>16;
2328                                 tci[0] &= tciwrapmask[0];
2329                                 tci[1] &= tciwrapmask[1];
2330                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2331                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2332                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2333                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2334                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2335                                 out4f[x*4+0] = c[0];
2336                                 out4f[x*4+1] = c[1];
2337                                 out4f[x*4+2] = c[2];
2338                                 out4f[x*4+3] = c[3];
2339                         }
2340                 }
2341         }
2342 }
2343
2344 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2345 {
2346 #ifdef SSE2_PRESENT
2347         int x;
2348         int startx = span->startx;
2349         int endx = span->endx;
2350         int flags;
2351         __m128 data, slope, tcscale;
2352         __m128i tcsize, tcmask, tcoffset, tcmax;
2353         __m128 tc, endtc;
2354         __m128i subtc, substep, endsubtc;
2355         int filter;
2356         int mip;
2357         unsigned int *outi = (unsigned int *)out4ub;
2358         const unsigned char * RESTRICT pixelbase;
2359         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2360         // if no texture is bound, just fill it with white
2361         if (!texture)
2362         {
2363                 memset(out4ub + startx*4, 255, span->length*4);
2364                 return;
2365         }
2366         mip = triangle->mip[texunitindex];
2367         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2368         // if this mipmap of the texture is 1 pixel, just fill it with that color
2369         if (texture->mipmap[mip][1] == 4)
2370         {
2371                 unsigned int k = *((const unsigned int *)pixelbase);
2372                 for (x = startx;x < endx;x++)
2373                         outi[x] = k;
2374                 return;
2375         }
2376         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2377         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2378         flags = texture->flags;
2379         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2380         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2381         tcscale = _mm_cvtepi32_ps(tcsize);
2382         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2383         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2384         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2385         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2386         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2387         tcmax = filter ? _mm_packs_epi32(tcmask, tcmask) : _mm_slli_epi32(tcmask, 16);  
2388         for (x = startx;x < endx;)
2389         {
2390                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2391                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2392                 if(nextsub >= endx)
2393                 {
2394                         nextsub = endsub = endx-1;
2395                         if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2396                 }       
2397                 tc = endtc;
2398                 subtc = endsubtc;
2399                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2400                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2401                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2402                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2403                 substep = _mm_slli_epi32(substep, 1);
2404                 if (filter)
2405                 {
2406                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2407                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2408                         {
2409                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2410                                 {
2411                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2412                                         tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0x10000)), tcoffset);
2413                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2414                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2415                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), _mm_setzero_si128());
2416                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))]), _mm_setzero_si128());
2417                                         fracm = _mm_srli_epi16(subtc, 1);
2418                                         pix1 = _mm_add_epi16(pix1,
2419                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2420                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2421                                         pix3 = _mm_add_epi16(pix3,
2422                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2423                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2424                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2425                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2426                                         pix2 = _mm_add_epi16(pix2,
2427                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2428                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2429                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2430                                 }
2431                                 if (x <= endsub)
2432                                 {
2433                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2434                                         tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0)), tcoffset);
2435                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2436                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2437                                         fracm = _mm_srli_epi16(subtc, 1);
2438                                         pix1 = _mm_add_epi16(pix1,
2439                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2440                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2441                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2442                                         pix1 = _mm_add_epi16(pix1,
2443                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2444                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2445                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2446                                         x++;
2447                                 }
2448                         }
2449                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2450                         {
2451                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2452                                 {
2453                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2454                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2455                                         tci = _mm_madd_epi16(tci, tcoffset);
2456                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2457                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2458                                                                                         _mm_setzero_si128());
2459                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2460                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2461                                                                                         _mm_setzero_si128());
2462                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2463                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2464                                         tci = _mm_madd_epi16(tci, tcoffset);
2465                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2466                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2467                                                                                         _mm_setzero_si128());
2468                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2469                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2470                                                                                         _mm_setzero_si128());
2471                                         fracm = _mm_srli_epi16(subtc, 1);
2472                                         pix1 = _mm_add_epi16(pix1,
2473                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2474                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2475                                         pix3 = _mm_add_epi16(pix3,
2476                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2477                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2478                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2479                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2480                                         pix2 = _mm_add_epi16(pix2,
2481                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2482                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2483                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2484                                 }
2485                                 if (x <= endsub)
2486                                 {
2487                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2488                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2489                                         tci = _mm_madd_epi16(tci, tcoffset);
2490                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2491                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2492                                                                                         _mm_setzero_si128());
2493                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2494                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2495                                                                                         _mm_setzero_si128());
2496                                         fracm = _mm_srli_epi16(subtc, 1);
2497                                         pix1 = _mm_add_epi16(pix1,
2498                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2499                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2500                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2501                                         pix1 = _mm_add_epi16(pix1,
2502                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2503                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2504                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2505                                         x++;
2506                                 }
2507                         }
2508                         else
2509                         {
2510                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2511                                 {
2512                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2513                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2514                                         tci = _mm_madd_epi16(tci, tcoffset);
2515                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2516                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2517                                                                                         _mm_setzero_si128());
2518                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2519                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2520                                                                                         _mm_setzero_si128());
2521                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2522                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2523                                         tci = _mm_madd_epi16(tci, tcoffset);
2524                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2525                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2526                                                                                         _mm_setzero_si128());
2527                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2528                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2529                                                                                         _mm_setzero_si128());
2530                                         fracm = _mm_srli_epi16(subtc, 1);
2531                                         pix1 = _mm_add_epi16(pix1,
2532                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2533                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2534                                         pix3 = _mm_add_epi16(pix3,
2535                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2536                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2537                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2538                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2539                                         pix2 = _mm_add_epi16(pix2,
2540                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2541                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2542                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2543                                 }
2544                                 if (x <= endsub)
2545                                 {
2546                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2547                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2548                                         tci = _mm_madd_epi16(tci, tcoffset);
2549                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2550                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2551                                                                                         _mm_setzero_si128());
2552                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2553                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2554                                                                                         _mm_setzero_si128());
2555                                         fracm = _mm_srli_epi16(subtc, 1);
2556                                         pix1 = _mm_add_epi16(pix1,
2557                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2558                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2559                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2560                                         pix1 = _mm_add_epi16(pix1,
2561                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2562                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2563                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2564                                         x++;
2565                                 }
2566                         }
2567                 }
2568                 else
2569                 {
2570                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2571                         {
2572                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2573                                 {
2574                                         __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax); 
2575                                         tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2576                                         tci = _mm_madd_epi16(tci, tcoffset);
2577                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2578                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
2579                                 }
2580                                 if (x <= endsub)
2581                                 {
2582                                         __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax);
2583                                         tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
2584                                         tci = _mm_madd_epi16(tci, tcoffset);
2585                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2586                                         x++;
2587                                 }
2588                         }
2589                         else
2590                         {
2591                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2592                                 {
2593                                         __m128i tci = _mm_and_si128(subtc, tcmax); 
2594                                         tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2595                                         tci = _mm_madd_epi16(tci, tcoffset);
2596                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2597                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
2598                                 }
2599                                 if (x <= endsub)
2600                                 {
2601                                         __m128i tci = _mm_and_si128(subtc, tcmax); 
2602                                         tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
2603                                         tci = _mm_madd_epi16(tci, tcoffset);
2604                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2605                                         x++;
2606                                 }
2607                         }
2608                 }
2609         }
2610 #endif
2611 }
2612
2613 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2614 {
2615         // TODO: IMPLEMENT
2616         memset(out4ub, 255, span->length*4);
2617 }
2618
// Shadowmap sampling placeholder: the input vector is ignored and the
// function always returns 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2624
2625 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2626 {
2627         int x;
2628         int startx = span->startx;
2629         int endx = span->endx;
2630         float c[4];
2631         float data[4];
2632         float slope[4];
2633         float z;
2634         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2635         for (x = startx;x < endx;x++)
2636         {
2637                 z = zf[x];
2638                 c[0] = (data[0] + slope[0]*x) * z;
2639                 c[1] = (data[1] + slope[1]*x) * z;
2640                 c[2] = (data[2] + slope[2]*x) * z;
2641                 c[3] = (data[3] + slope[3]*x) * z;
2642                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2643                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2644                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2645                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2646         }
2647 }
2648
2649 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2650 {
2651         int x;
2652         int startx = span->startx;
2653         int endx = span->endx;
2654         float c[4];
2655         float data[4];
2656         float slope[4];
2657         float z;
2658         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2659         for (x = startx;x < endx;x++)
2660         {
2661                 z = zf[x];
2662                 c[0] = (data[0] + slope[0]*x) * z;
2663                 c[1] = (data[1] + slope[1]*x) * z;
2664                 c[2] = (data[2] + slope[2]*x) * z;
2665                 c[3] = (data[3] + slope[3]*x) * z;
2666                 out4f[x*4+0] = c[0];
2667                 out4f[x*4+1] = c[1];
2668                 out4f[x*4+2] = c[2];
2669                 out4f[x*4+3] = c[3];
2670         }
2671 }
2672
2673 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2674 {
2675         int x, startx = span->startx, endx = span->endx;
2676         float c[4], localcolor[4];
2677         localcolor[0] = subcolor[0];
2678         localcolor[1] = subcolor[1];
2679         localcolor[2] = subcolor[2];
2680         localcolor[3] = subcolor[3];
2681         for (x = startx;x < endx;x++)
2682         {
2683                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2684                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2685                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2686                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2687                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2688                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2689                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2690                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2691         }
2692 }
2693
2694 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2695 {
2696         int x, startx = span->startx, endx = span->endx;
2697         for (x = startx;x < endx;x++)
2698         {
2699                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2700                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2701                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2702                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2703         }
2704 }
2705
2706 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2707 {
2708         int x, startx = span->startx, endx = span->endx;
2709         for (x = startx;x < endx;x++)
2710         {
2711                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2712                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2713                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2714                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2715         }
2716 }
2717
2718 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2719 {
2720         int x, startx = span->startx, endx = span->endx;
2721         float a, b;
2722         for (x = startx;x < endx;x++)
2723         {
2724                 a = 1.0f - inb4f[x*4+3];
2725                 b = inb4f[x*4+3];
2726                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2727                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2728                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2729                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2730         }
2731 }
2732
2733 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2734 {
2735         int x, startx = span->startx, endx = span->endx;
2736         float localcolor[4], ilerp, lerp;
2737         localcolor[0] = color[0];
2738         localcolor[1] = color[1];
2739         localcolor[2] = color[2];
2740         localcolor[3] = color[3];
2741         ilerp = 1.0f - localcolor[3];
2742         lerp = localcolor[3];
2743         for (x = startx;x < endx;x++)
2744         {
2745                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2746                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2747                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2748                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2749         }
2750 }
2751
2752
2753
// Modulates a span of 4-byte pixels (in4ub) by an interpolated attribute and
// writes the result to out4ub, for x in [span->startx, span->endx).
// The attribute for pixel x is (data + slope*x) * zf[x], with data/slope
// supplied by DPSOFTRAST_CALCATTRIB for arrayindex.
// The whole body is compiled only when SSE2_PRESENT is defined; without it
// the function does nothing.
void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
{
#ifdef SSE2_PRESENT
        int x;
        int startx = span->startx;
        int endx = span->endx;
        __m128 data, slope;
        DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
        // swap lanes 0 and 2 (lanes 1 and 3 stay put); presumably converts the
        // attribute from RGBA to the BGRA byte order of the pixels - TODO confirm
        data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
        slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
        // advance the attribute to the first pixel of the span
        data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
        // scale by 256 so the converted integer acts as a fixed-point factor
        // with 8 fractional bits in the multiply below
        data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
        slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
        // two pixels per iteration; data is stepped once inside the body (for
        // pixel x+1) and once more by the loop increment
        for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
        {
                // interleaving with zero as the LOW byte places each source byte
                // in the high half of a 16-bit lane (value << 8)
                __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
                __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
                data = _mm_add_ps(data, slope);
                mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
                // low 64 bits: factors for pixel x, high 64 bits: factors for pixel x+1
                // (each narrowed to 16 bits with signed saturation)
                mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
                // high 16 bits of (byte<<8) * factor, i.e. (byte * factor) >> 8
                pix = _mm_mulhi_epu16(pix, mod);
                // narrow to bytes with unsigned saturation and store 8 bytes (two pixels)
                _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
        }
        // leftover pixel when the span length is odd
        for (;x < endx;x++, data = _mm_add_ps(data, slope))
        {
                __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
                __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
                mod = _mm_packs_epi32(mod, mod);
                pix = _mm_mulhi_epu16(pix, mod);
                *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
        }
#endif
}
2787
// Writes an interpolated attribute as 4-byte pixels into out4ub for x in
// [span->startx, span->endx).  The value for pixel x is
// (data + slope*x) * zf[x] * 255, converted to integers and saturated to bytes;
// data/slope are supplied by DPSOFTRAST_CALCATTRIB for arrayindex.
// The whole body is compiled only when SSE2_PRESENT is defined; without it
// the function does nothing.
void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
{
#ifdef SSE2_PRESENT
        int x;
        int startx = span->startx;
        int endx = span->endx;
        __m128 data, slope;
        DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
        // swap lanes 0 and 2 (lanes 1 and 3 stay put); presumably converts the
        // attribute from RGBA to the BGRA byte order of the output - TODO confirm
        data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
        slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
        // advance the attribute to the first pixel of the span
        data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
        // pre-scale so the per-pixel product lands directly in byte units
        data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
        slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
        // two pixels per iteration; data is stepped once inside the body (for
        // pixel x+1) and once more by the loop increment
        for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
        {
                __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
                data = _mm_add_ps(data, slope);
                pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
                // low 64 bits: pixel x, high 64 bits: pixel x+1 (16 bits per channel)
                pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
                // narrow to bytes with unsigned saturation and store 8 bytes (two pixels)
                _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
        }
        // leftover pixel when the span length is odd
        for (;x < endx;x++, data = _mm_add_ps(data, slope))
        {
                __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
                pix = _mm_packs_epi32(pix, pix);
                *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
        }
#endif
}
2817
2818 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2819 {
2820 #ifdef SSE2_PRESENT
2821         int x, startx = span->startx, endx = span->endx;
2822         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2823         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2824         for (x = startx;x+2 <= endx;x+=2)
2825         {
2826                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2827                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2828                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2829                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2830         }
2831         if(x < endx)
2832         {
2833                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2834                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2835                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2836                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2837         }
2838 #endif
2839 }
2840
2841 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2842 {
2843 #ifdef SSE2_PRESENT
2844         int x, startx = span->startx, endx = span->endx;
2845         for (x = startx;x+2 <= endx;x+=2)
2846         {
2847                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2848                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2849                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2850                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2851         }
2852         if(x < endx)
2853         {
2854                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2855                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2856                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2857                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2858         }
2859 #endif
2860 }
2861
2862 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2863 {
2864 #ifdef SSE2_PRESENT
2865         int x, startx = span->startx, endx = span->endx;
2866         for (x = startx;x+2 <= endx;x+=2)
2867         {
2868                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2869                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2870                 pix1 = _mm_add_epi16(pix1, pix2);
2871                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2872         }
2873         if(x < endx)
2874         {
2875                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2876                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2877                 pix1 = _mm_add_epi16(pix1, pix2);
2878                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2879         }
2880 #endif
2881 }
2882
2883 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2884 {
2885 #ifdef SSE2_PRESENT
2886         int x, startx = span->startx, endx = span->endx;
2887         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
2888         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
2889         for (x = startx;x+2 <= endx;x+=2)
2890         {
2891                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2892                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2893                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2894                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2895         }
2896         if(x < endx)
2897         {
2898                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2899                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2900                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2901                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2902         }
2903 #endif
2904 }
2905
2906 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2907 {
2908 #ifdef SSE2_PRESENT
2909         int x, startx = span->startx, endx = span->endx;
2910         for (x = startx;x+2 <= endx;x+=2)
2911         {
2912                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2913                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2914                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2915                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2916                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2917         }
2918         if(x < endx)
2919         {
2920                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2921                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2922                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
2923                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2924                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2925         }
2926 #endif
2927 }
2928
2929 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
2930 {
2931 #ifdef SSE2_PRESENT
2932         int x, startx = span->startx, endx = span->endx;
2933         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
2934         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2935         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
2936         for (x = startx;x+2 <= endx;x+=2)
2937         {
2938                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
2939                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2940                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2941         }
2942         if(x < endx)
2943         {
2944                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
2945                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2946                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2947         }
2948 #endif
2949 }
2950
2951
2952
2953 void DPSOFTRAST_VertexShader_Generic(void)
2954 {
2955         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2956         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
2957         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
2958         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
2959                 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
2960 }
2961
2962 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
2963 {
2964         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2965         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2966         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2967         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2968         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
2969         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
2970         {
2971                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
2972                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
2973                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
2974                 {
2975                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
2976                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2977                         {
2978                                 // multiply
2979                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2980                         }
2981                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2982                         {
2983                                 // add
2984                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2985                         }
2986                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
2987                         {
2988                                 // alphablend
2989                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2990                         }