]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
nearest filtering fix
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "dpsoftrast.h"
7
// Build configuration: threading support plus alignment and memory-barrier helpers.
// Threading is enabled only when the build defines USE_SDL.
8 #ifdef USE_SDL
9 #define USE_THREADS
10 #endif
11
12 #ifdef USE_THREADS
13 #include <SDL.h>
14 #include <SDL_thread.h>
15 #endif
16
// Plain C builds get "bool" as an alias of qboolean.
17 #ifndef __cplusplus
18 typedef qboolean bool;
19 #endif
20
// Byte alignments matching the values hard-coded in ALIGN() and ATOMIC() below.
21 #define ALIGN_SIZE 16
22 #define ATOMIC_SIZE 32
23
// ALIGN(var) / ATOMIC(var) wrap a declaration with a 16 / 32 byte alignment
// request in the compiler's own syntax; other compilers get no alignment.
// MEMORY_BARRIER expands to _mm_sfence() on GCC and MSVC, and to a no-op elsewhere.
24 #if defined(__GNUC__)
25 #define ALIGN(var) var __attribute__((__aligned__(16)))
26 #define ATOMIC(var) var __attribute__((__aligned__(32)))
27 #define MEMORY_BARRIER (_mm_sfence())
28 //(__sync_synchronize())
29 #elif defined(_MSC_VER)
30 #define ALIGN(var) __declspec(align(16)) var
31 #define ATOMIC(var) __declspec(align(32)) var
32 #define MEMORY_BARRIER (_mm_sfence())
33 //(MemoryBarrier())
34 #else
35 #define ALIGN(var) var
36 #define ATOMIC(var) var
37 #define MEMORY_BARRIER ((void)0)
38 #endif
39
// Without threads or without SSE2 the barrier is forced back to a no-op.
40 #if !defined(USE_THREADS) || !defined(SSE2_PRESENT)
41 #undef MEMORY_BARRIER
42 #define MEMORY_BARRIER ((void)0)
43 #endif
44
// Allocation helpers: MM_MALLOC / MM_CALLOC / MM_FREE.
// With SSE2 they use _mm_malloc/_mm_free so blocks are aligned to ATOMIC_SIZE;
// otherwise they map directly onto malloc/calloc/free.
// Memory obtained from MM_MALLOC or MM_CALLOC must be released with MM_FREE.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned equivalent of calloc(): returns nmemb*size zeroed bytes, or NULL on
// allocation failure or when the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc() refuses requests whose total wraps around; match that here
	// instead of silently allocating a too-small block
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
63
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets). DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// to size per-array tables elsewhere in this file.
64 typedef enum DPSOFTRAST_ARRAY_e
65 {
66         DPSOFTRAST_ARRAY_POSITION,
67         DPSOFTRAST_ARRAY_COLOR,
68         DPSOFTRAST_ARRAY_TEXCOORD0,
69         DPSOFTRAST_ARRAY_TEXCOORD1,
70         DPSOFTRAST_ARRAY_TEXCOORD2,
71         DPSOFTRAST_ARRAY_TEXCOORD3,
72         DPSOFTRAST_ARRAY_TEXCOORD4,
73         DPSOFTRAST_ARRAY_TEXCOORD5,
74         DPSOFTRAST_ARRAY_TEXCOORD6,
75         DPSOFTRAST_ARRAY_TEXCOORD7,
76         DPSOFTRAST_ARRAY_TOTAL
77 }
78 DPSOFTRAST_ARRAY;
79
// One slot of the texture table. A slot whose bytes pointer is NULL is free.
80 typedef struct DPSOFTRAST_Texture_s
81 {
        // DPSOFTRAST_TEXTURE_* flags passed to DPSOFTRAST_Texture_New
82         int flags;
        // dimensions of mip level 0
83         int width;
84         int height;
85         int depth;
        // 6 for cubemaps, 1 otherwise
86         int sides;
87         DPSOFTRAST_TEXTURE_FILTER filter;
        // number of valid entries in mipmap[]
88         int mipmaps;
        // total size in bytes of all mip levels
89         int size;
        // pixel storage for all mip levels (4 bytes per pixel)
90         unsigned char *bytes;
        // per mip level: [0] byte offset into bytes, [1] size in bytes,
        // [2] width, [3] height, [4] depth
91         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
92 }
93 DPSOFTRAST_Texture;
94
// Commands use the same 16 byte alignment as ALIGN().
95 #define COMMAND_SIZE ALIGN_SIZE
96 #define COMMAND_ALIGN(var) ALIGN(var)
97
// Common header of every command: all command structs start with the opcode.
98 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
99 {
100         int opcode;
101 }
102 DPSOFTRAST_Command);
103
// Opcode 0 is reserved for "Reset".
104 enum { DPSOFTRAST_OPCODE_Reset = 0 };
105
// DEFCOMMAND(opcodeval, name, fields) declares the opcode constant
// DPSOFTRAST_OPCODE_<name> and the aligned struct DPSOFTRAST_Command_<name>,
// which consists of the opcode followed by the given fields.
106 #define DEFCOMMAND(opcodeval, name, fields) \
107         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
108         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
109         { \
110                 int opcode; \
111                 fields \
112         } DPSOFTRAST_Command_##name );
113
// Size in bytes of the command pool buffer.
114 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
115
// Byte buffer holding queued commands.
116 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
117 {
        // offset at which the next command is written (recorded into each
        // triangle's commandoffset by DPSOFTRAST_Draw_SyncCommands)
118         int freecommand;
        // amount of the pool currently considered in use
119         int usedcommands;
120         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
121 }
122 DPSOFTRAST_State_Command_Pool);
123
// One entry of the triangle pool.
124 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
125 {
        // command pool position captured when the entry was created
126         int commandoffset;
127         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        // both are set to -1 when the entry is created by DPSOFTRAST_Draw_SyncCommands
128         int starty;
129         int endy;
130         int numpoints;
131         float w[3];
132         ALIGN(float coords[4][4]);
133         ALIGN(int ycoords[4]);
        // per attribute array: [0] slope per x, [1] slope per y, [2] base value
        // (this is how DPSOFTRAST_CALCATTRIB combines them)
134         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
135 }
136 DPSOFTRAST_State_Triangle);
137
// Evaluates one interpolated vertex attribute at the start of a span (SSE version).
//   triangle   - pointer to a structure with an attribs[array][3][4] table
//   span       - pointer to a structure with x and y members
//   data       - receives attribs[array][2] + x*attribs[array][0] + y*attribs[array][1]
//   slope      - receives attribs[array][0] (the per-x step)
//   arrayindex - which attribute array to evaluate
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4] arrays.
// Every macro argument is parenthesized so that expressions such as &span or
// ptr + 1 expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
154                                         
155 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
156
// A horizontal run of pixels on one framebuffer row, generated from a triangle.
157 typedef ALIGN(struct DPSOFTRAST_State_Span_s
158 {
159         int triangle; // triangle this span was generated by
160         int x; // framebuffer x coord
161         int y; // framebuffer y coord
162         int length; // pixel count
163         int startx; // usable range (according to pixelmask)
164         int endx; // usable range (according to pixelmask)
165         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
166 }
167 DPSOFTRAST_State_Span);
168
// Capacity of each thread's span array.
169 #define DPSOFTRAST_DRAW_MAXSPANS 1024
170
// Number of entries in the triangle ring buffer.
171 #define DPSOFTRAST_DRAW_MAXTRIANGLEPOOL 4096
172 #define DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES 64
173
// Ring buffer of triangles; indices wrap at DPSOFTRAST_DRAW_MAXTRIANGLEPOOL.
174 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_Pool_s
175 {
        // index of the next entry to be written
176         int freetriangle;
        // count of entries considered in use (refreshed by DPSOFTRAST_Draw_FreeTrianglePool)
177         int usedtriangles;
178         ATOMIC(DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLEPOOL]);
179 }
180 DPSOFTRAST_State_Triangle_Pool);
181
// Bit flags stored in a thread's "validate" field; each set bit means the
// matching derived state must be recomputed by DPSOFTRAST_Validate.
182 #define DPSOFTRAST_VALIDATE_FB 1
183 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
184 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All flags needed before drawing.
185 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
186
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc selects
// one from the source/destination blend factors and stores it in fb_blendmode.
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
187 typedef enum DPSOFTRAST_BLENDMODE_e
188 {
189         DPSOFTRAST_BLENDMODE_OPAQUE,
190         DPSOFTRAST_BLENDMODE_ALPHA,
191         DPSOFTRAST_BLENDMODE_ADDALPHA,
192         DPSOFTRAST_BLENDMODE_ADD,
193         DPSOFTRAST_BLENDMODE_INVMOD,
194         DPSOFTRAST_BLENDMODE_MUL,
195         DPSOFTRAST_BLENDMODE_MUL2,
196         DPSOFTRAST_BLENDMODE_SUBALPHA,
197         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
198         DPSOFTRAST_BLENDMODE_TOTAL
199 }
200 DPSOFTRAST_BLENDMODE;
201
// Per-thread rasterizer state: a private copy of the render state, the values
// derived from it, and the thread's position in the shared pools.
202 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
203 {
204 #ifdef USE_THREADS
205         SDL_Thread *thread;
206 #endif
207         int index;
208         
        // render state
209         int colormask[4];
        // [0] source factor, [1] destination factor (see DPSOFTRAST_RecalcBlendFunc)
210         int blendfunc[2];
211         int blendsubtract;
212         int depthmask;
213         int depthtest;
214         int depthfunc;
215         int scissortest;
216         int alphatest;
217         int alphafunc;
218         float alphavalue;
        // x, y, width, height as consumed by DPSOFTRAST_RecalcFB
219         int scissor[4];
220         int viewport[4];
221         float depthrange[2];
222         float polygonoffset[2];
223
224         int shader_mode;
225         int shader_permutation;
226
227         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
228         
229         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
230         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
231
232         // DPSOFTRAST_VALIDATE_ flags
233         int validate;
234
235         // derived values (DPSOFTRAST_VALIDATE_FB)
236         int fb_colormask;
237         int fb_clearscissor[4];
238
239         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
240         int fb_depthfunc;
241
242         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
243         int fb_blendmode;
244
        // this thread's positions in the command pool and the triangle pool
245         ATOMIC(int commandoffset);
246         int triangleoffset;
247
        // set while another thread is blocked on waitcond waiting for this
        // thread to make progress (see DPSOFTRAST_Draw_FreeTrianglePool)
248         bool waiting;
249 #ifdef USE_THREADS
250         SDL_cond *waitcond;
251 #endif
252
253         int numspans;
254         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
255 }
256 DPSOFTRAST_State_Thread);
257
// Global rasterizer state: render targets, vertex array pointers, the texture
// table, worker threads and the shared command / triangle pools.
258 typedef ATOMIC(struct DPSOFTRAST_State_s
259 {
        // render targets, as set by DPSOFTRAST_SetRenderTargets
260         int fb_width;
261         int fb_height;
262         unsigned int *fb_depthpixels;
263         unsigned int *fb_colorpixels[4];
264
265         int viewport[4];
266         ALIGN(float fb_viewportcenter[4]);
267         ALIGN(float fb_viewportscale[4]);
268
269         float color[4];
270         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
271         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
272
273         int cullface;
274
275         const float *pointer_vertex3f;
276         const float *pointer_color4f;
277         const unsigned char *pointer_color4ub;
278         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
279         int stride_vertex;
280         int stride_color;
281         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
282         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
283         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
284
285         int numvertices;
286         int maxvertices;
287         float *in_array4f[DPSOFTRAST_ARRAY_TOTAL];
288         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
289         float *screencoord4f;
290
291         int shader_mode;
292         int shader_permutation;
293
        // texture table: texture_max is the allocated slot count, texture_end
        // is one past the highest slot in use, texture_firstfree is where the
        // search for a free slot starts
294         int texture_max;
295         int texture_end;
296         int texture_firstfree;
297         DPSOFTRAST_Texture *texture;
298
299         int bigendian;
300
301         // error reporting
302         const char *errorstring;
303
304         int numthreads;
305         DPSOFTRAST_State_Thread *threads;
306 #ifdef USE_THREADS
        // trianglemutex guards the pool bookkeeping shared with the threads
307         SDL_mutex *trianglemutex;
308         SDL_cond *trianglecond;
309 #endif
310
        // written by DPSOFTRAST_Draw_SyncCommands after a memory barrier
311         ATOMIC(int drawtriangle);
312
313         DPSOFTRAST_State_Command_Pool commandpool;
314         DPSOFTRAST_State_Triangle_Pool trianglepool;
315 }
316 DPSOFTRAST_State);
317
// The single global rasterizer instance.
318 DPSOFTRAST_State dpsoftrast;
319
320 extern int dpsoftrast_test;
321
// Scale applied when converting a float depth to an integer depth value (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float color components (nominally 0..1) into one int laid out as
// alpha<<24 | red<<16 | green<<8 | blue, rounding each component to nearest.
// Arguments are parenthesized so expressions such as (a + b) convert correctly;
// the alpha shift is done on an unsigned value because 255<<24 does not fit a
// signed int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to an integer: DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
327
328 void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
329 {
330         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
331         // and viewport projection values
332         int x1, x2;
333         int y1, y2;
334         x1 = thread->scissor[0];
335         x2 = thread->scissor[0] + thread->scissor[2];
336         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
337         y2 = dpsoftrast.fb_height - thread->scissor[1];
338         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
339         if (x1 < 0) x1 = 0;
340         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
341         if (y1 < 0) y1 = 0;
342         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
343         thread->fb_clearscissor[0] = x1;
344         thread->fb_clearscissor[1] = y1;
345         thread->fb_clearscissor[2] = x2 - x1;
346         thread->fb_clearscissor[3] = y2 - y1;
347 }
348
349 void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
350 {
351         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
352 }
353
354 void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
355 {
356         if (thread->blendsubtract)
357         {
358                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
359                 {
360                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
361                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
362                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
363                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
364                 }
365         }
366         else
367         {       
368             switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
369             {
370                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
371                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
372                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
373                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
374                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
375                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
376                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
377                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
378                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
379                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
380                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
381             }
382         }
383 }
384
// Calls DPSOFTRAST_Validate(thread, f) only when one of the flags in f is
// pending in thread->validate; always evaluates to 0. Both arguments are
// parenthesized so that expressions such as &state or base + i expand correctly.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
386
387 void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
388 {
389         mask &= thread->validate;
390         if (!mask)
391                 return;
392         if (mask & DPSOFTRAST_VALIDATE_FB)
393         {
394                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
395                 DPSOFTRAST_RecalcFB(thread);
396         }
397         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
398         {
399                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
400                 DPSOFTRAST_RecalcDepthFunc(thread);
401         }
402         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
403         {
404                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
405                 DPSOFTRAST_RecalcBlendFunc(thread);
406         }
407 }
408
409 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
410 {
411         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
412                 return &dpsoftrast.texture[index];
413         return NULL;
414 }
415
416 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
417 {
418         int w;
419         int h;
420         int d;
421         int size;
422         int s;
423         int texnum;
424         int mipmaps;
425         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
426         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
427         DPSOFTRAST_Texture *texture;
428         if (width*height*depth < 1)
429         {
430                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
431                 return 0;
432         }
433         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
434         {
435                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
436                 return 0;
437         }
438         switch(texformat)
439         {
440         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
441         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
442         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
443                 break;
444         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
445                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
446                 {
447                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
448                         return 0;
449                 }
450                 if (depth != 1)
451                 {
452                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
453                         return 0;
454                 }
455                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
456                 {
457                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
458                         return 0;
459                 }
460                 break;
461         }
462         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
463         {
464                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
465                 return 0;
466         }
467         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
468         {
469                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
470                 return 0;
471         }
472         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
473         {
474                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
475                 return 0;
476         }
477         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
478         {
479                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
480                 return 0;
481         }
482         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
483         {
484                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
485                 return 0;
486         }
487         DPSOFTRAST_Flush();
488         // find first empty slot in texture array
489         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
490                 if (!dpsoftrast.texture[texnum].bytes)
491                         break;
492         dpsoftrast.texture_firstfree = texnum + 1;
493         if (dpsoftrast.texture_max <= texnum)
494         {
495                 // expand texture array as needed
496                 if (dpsoftrast.texture_max < 1024)
497                         dpsoftrast.texture_max = 1024;
498                 else
499                         dpsoftrast.texture_max *= 2;
500                 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
501         }
502         if (dpsoftrast.texture_end <= texnum)
503                 dpsoftrast.texture_end = texnum + 1;
504         texture = &dpsoftrast.texture[texnum];
505         memset(texture, 0, sizeof(*texture));
506         texture->flags = flags;
507         texture->width = width;
508         texture->height = height;
509         texture->depth = depth;
510         texture->sides = sides;
511         w = width;
512         h = height;
513         d = depth;
514         size = 0;
515         mipmaps = 0;
516         w = width;
517         h = height;
518         d = depth;
519         for (;;)
520         {
521                 s = w * h * d * sides * 4;
522                 texture->mipmap[mipmaps][0] = size;
523                 texture->mipmap[mipmaps][1] = s;
524                 texture->mipmap[mipmaps][2] = w;
525                 texture->mipmap[mipmaps][3] = h;
526                 texture->mipmap[mipmaps][4] = d;
527                 size += s;
528                 mipmaps++;
529                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
530                         break;
531                 if (w > 1) w >>= 1;
532                 if (h > 1) h >>= 1;
533                 if (d > 1) d >>= 1;
534         }
535         texture->mipmaps = mipmaps;
536         texture->size = size;
537
538         // allocate the pixels now
539         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
540
541         return texnum;
542 }
543 void DPSOFTRAST_Texture_Free(int index)
544 {
545         DPSOFTRAST_Texture *texture;
546         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
547         DPSOFTRAST_Flush();
548         if (texture->bytes)
549                 MM_FREE(texture->bytes);
550         texture->bytes = NULL;
551         memset(texture, 0, sizeof(*texture));
552         // adjust the free range and used range
553         if (dpsoftrast.texture_firstfree > index)
554                 dpsoftrast.texture_firstfree = index;
555         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
556                 dpsoftrast.texture_end--;
557 }
// Rebuilds every mip level above 0 by averaging the previous level.
// Each output pixel is the rounded mean of the 2 (x) by 2 (y) by 2 (z) block
// of parent pixels it covers; when the parent is only one pixel thick along an
// axis, that axis is not averaged. Pixels are 4 bytes each.
// Does nothing for unknown textures or textures with a single mip level.
558 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
559 {
560         int i, x, y, z, w, layer0, layer1, row0, row1;
561         unsigned char *o, *i0, *i1, *i2, *i3;
562         DPSOFTRAST_Texture *texture;
563         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
564         if (texture->mipmaps <= 1)
565                 return;
        // mipmap[level]: [0] byte offset, [2] width, [3] height, [4] depth
566         for (i = 1;i < texture->mipmaps;i++)
567         {
568                 for (z = 0;z < texture->mipmap[i][4];z++)
569                 {
                        // the two parent layers feeding this layer; clamped
                        // so layer1 == layer0 when the parent has one layer
570                         layer0 = z*2;
571                         layer1 = z*2+1;
572                         if (layer1 >= texture->mipmap[i-1][4])
573                                 layer1 = texture->mipmap[i-1][4]-1;
574                         for (y = 0;y < texture->mipmap[i][3];y++)
575                         {
                                // the two parent rows feeding this row, clamped the same way
576                                 row0 = y*2;
577                                 row1 = y*2+1;
578                                 if (row1 >= texture->mipmap[i-1][3])
579                                         row1 = texture->mipmap[i-1][3]-1;
                                // o: start of the output row; i0..i3: starts of the
                                // parent rows (layer0,row0) (layer0,row1) (layer1,row0) (layer1,row1)
580                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
581                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
582                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
583                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
584                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
585                                 w = texture->mipmap[i][2];
586                                 if (layer1 > layer0)
587                                 {
588                                         if (texture->mipmap[i-1][2] > 1)
589                                         {
590                                                 // average 3D texture
591                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
592                                                 {
593                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
594                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
595                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
596                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
597                                                 }
598                                         }
599                                         else
600                                         {
601                                                 // average 3D mipmap with parent width == 1
                                                // NOTE(review): i2 and i3 are not advanced here; this is only
                                                // correct if w is 1 in this branch - confirm
602                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
603                                                 {
604                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
605                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
606                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
607                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
608                                                 }
609                                         }
610                                 }
611                                 else
612                                 {
613                                         if (texture->mipmap[i-1][2] > 1)
614                                         {
615                                                 // average 2D texture (common case)
616                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
617                                                 {
618                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
619                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
620                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
621                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
622                                                 }
623                                         }
624                                         else
625                                         {
626                                                 // 2D texture with parent width == 1
627                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
628                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
629                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
630                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
631                                         }
632                                 }
633                         }
634                 }
635         }
636 }
637 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
638 {
639         DPSOFTRAST_Texture *texture;
640         unsigned char *dst;
641         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
642         DPSOFTRAST_Flush();
643         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
644         while (blockheight > 0)
645         {
646                 memcpy(dst, pixels, blockwidth * 4);
647                 pixels += blockwidth * 4;
648                 dst += texture->mipmap[0][2] * 4;
649                 blockheight--;
650         }
651         DPSOFTRAST_Texture_CalculateMipmaps(index);
652 }
653 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
654 {
655         DPSOFTRAST_Texture *texture;
656         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
657         DPSOFTRAST_Flush();
658         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
659         DPSOFTRAST_Texture_CalculateMipmaps(index);
660 }
661 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
662 {
663         DPSOFTRAST_Texture *texture;
664         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
665         return texture->mipmap[mip][2];
666 }
667 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
668 {
669         DPSOFTRAST_Texture *texture;
670         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
671         return texture->mipmap[mip][3];
672 }
673 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
674 {
675         DPSOFTRAST_Texture *texture;
676         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
677         return texture->mipmap[mip][4];
678 }
679 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
680 {
681         DPSOFTRAST_Texture *texture;
682         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
683         DPSOFTRAST_Flush();
684         return texture->bytes + texture->mipmap[mip][0];
685 }
686 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
687 {
688         DPSOFTRAST_Texture *texture;
689         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
690         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
691         {
692                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
693                 return;
694         }
695         DPSOFTRAST_Flush();
696         texture->filter = filter;
697 }
698
699 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
700 {
701         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
702                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
703                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
704                 DPSOFTRAST_Flush();
705         dpsoftrast.fb_width = width;
706         dpsoftrast.fb_height = height;
707         dpsoftrast.fb_depthpixels = depthpixels;
708         dpsoftrast.fb_colorpixels[0] = colorpixels0;
709         dpsoftrast.fb_colorpixels[1] = colorpixels1;
710         dpsoftrast.fb_colorpixels[2] = colorpixels2;
711         dpsoftrast.fb_colorpixels[3] = colorpixels3;
712 }
713
714 void DPSOFTRAST_Draw_FlushThreads(void);
715
// Makes sure at least "space" entries of the triangle ring buffer are free.
// Returns at once when the cached usedtriangles count already leaves enough
// room. Otherwise it recomputes the count from the threads' positions and, in
// threaded builds, waits for the thread that is furthest behind until enough
// entries are free. The recomputed count is stored back into the pool.
716 void DPSOFTRAST_Draw_FreeTrianglePool(int space)
717 {
718         DPSOFTRAST_State_Thread *thread;
719         int i;
720         int freetriangle = dpsoftrast.trianglepool.freetriangle;
721         int usedtriangles = dpsoftrast.trianglepool.usedtriangles;
722         if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space)
723             return;
724 #ifdef USE_THREADS
725         SDL_LockMutex(dpsoftrast.trianglemutex);
726 #endif
727         for(;;)
728         {
729             int waitindex = -1;
730             int triangleoffset;
            // usedtriangles becomes the largest distance (modulo the pool size)
            // between the write position and any thread's triangleoffset;
            // waitindex is the thread with that largest distance
731             usedtriangles = 0;
732             for (i = 0; i < dpsoftrast.numthreads; i++)
733             {
734                 thread = &dpsoftrast.threads[i];
735                 triangleoffset = freetriangle - thread->triangleoffset;
736                 if (triangleoffset < 0)
737                     triangleoffset += DPSOFTRAST_DRAW_MAXTRIANGLEPOOL;
738                 if (triangleoffset > usedtriangles)
739                 {
740                     waitindex = i;
741                     usedtriangles = triangleoffset;
742                 }
743             }
744             if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space || waitindex < 0)
745                 break;
            // NOTE(review): without USE_THREADS nothing in this loop changes
            // between iterations, so reaching this point would spin forever -
            // confirm callers never get here in non-threaded builds
746 #ifdef USE_THREADS
            // mark the slowest thread as being waited on, wake threads blocked
            // on trianglecond, then sleep on that thread's waitcond (this
            // releases trianglemutex while sleeping)
747             thread = &dpsoftrast.threads[waitindex];
748             thread->waiting = true;
749             SDL_CondBroadcast(dpsoftrast.trianglecond);
750             SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
751             thread->waiting = false;
752 #endif
753         }
754 #ifdef USE_THREADS
755         SDL_UnlockMutex(dpsoftrast.trianglemutex);
756 #endif
757         dpsoftrast.trianglepool.usedtriangles = usedtriangles;
758 }
759
760 void DPSOFTRAST_Draw_SyncCommands(void)
761 {
762         DPSOFTRAST_State_Triangle *triangle;
763         if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
764 #ifdef USE_THREADS
765             DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
766 #else
767             DPSOFTRAST_Draw_FlushThreads();
768 #endif
769         triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
770         triangle->commandoffset = dpsoftrast.commandpool.freecommand;
771         triangle->starty = -1;
772         triangle->endy = -1;
773         dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
774         dpsoftrast.trianglepool.usedtriangles++;
775         MEMORY_BARRIER;
776         dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
777 }
778
779 void DPSOFTRAST_Draw_FreeCommandPool(int space)
780 {
781         DPSOFTRAST_State_Thread *thread;
782         int i;
783         int freecommand = dpsoftrast.commandpool.freecommand;
784         int usedcommands = dpsoftrast.commandpool.usedcommands;
785         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
786                 return;
787         DPSOFTRAST_Draw_SyncCommands();
788 #ifdef USE_THREADS
789         SDL_LockMutex(dpsoftrast.trianglemutex);
790 #endif
791         for(;;)
792         {
793                 int waitindex = -1;
794                 int commandoffset;
795                 usedcommands = 0;
796                 for (i = 0; i < dpsoftrast.numthreads; i++)
797                 {
798                         thread = &dpsoftrast.threads[i]; 
799                         commandoffset = freecommand - thread->commandoffset;
800                         if (commandoffset < 0)
801                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
802                         if (commandoffset > usedcommands)
803                         {
804                                 waitindex = i;
805                                 usedcommands = commandoffset;
806                         }
807                 }
808                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
809                         break;
810 #ifdef USE_THREADS
811                 thread = &dpsoftrast.threads[waitindex];
812                 thread->waiting = true;
813                 SDL_CondBroadcast(dpsoftrast.trianglecond);
814                 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
815                 thread->waiting = false;
816 #endif
817         }
818 #ifdef USE_THREADS
819         SDL_UnlockMutex(dpsoftrast.trianglemutex);
820 #endif
821         dpsoftrast.commandpool.usedcommands = usedcommands;
822 }
823
824 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
825         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand(sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1))))
826
827 static void *DPSOFTRAST_AllocateCommand(int size)
828 {
829         DPSOFTRAST_Command *command;
830         int freecommand = dpsoftrast.commandpool.freecommand;
831         int usedcommands = dpsoftrast.commandpool.usedcommands;
832         int extra = sizeof(DPSOFTRAST_Command);
833         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
834                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
835         if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
836         {
837 #ifdef USE_THREADS
838                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
839 #else
840                 DPSOFTRAST_Draw_FlushThreads();
841 #endif
842                 freecommand = dpsoftrast.commandpool.freecommand;
843                 usedcommands = dpsoftrast.commandpool.usedcommands;
844         }
845         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
846         {
847                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
848                 command->opcode = DPSOFTRAST_OPCODE_Reset;
849                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
850                 freecommand = 0;
851         }
852         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
853         freecommand += size;
854         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
855                 freecommand = 0;
856
857         dpsoftrast.commandpool.freecommand = freecommand;
858         dpsoftrast.commandpool.usedcommands = usedcommands + size;
859         return command;
860 }
861         
862 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
863 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
864 {
865         thread->viewport[0] = command->x;
866         thread->viewport[1] = command->y;
867         thread->viewport[2] = command->width;
868         thread->viewport[3] = command->height;
869         thread->validate |= DPSOFTRAST_VALIDATE_FB;
870 }
871 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
872 {
873         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
874         command->opcode = DPSOFTRAST_OPCODE_Viewport;
875         command->x = x;
876         command->y = y;
877         command->width = width;
878         command->height = height;
879
880         dpsoftrast.viewport[0] = x;
881         dpsoftrast.viewport[1] = y;
882         dpsoftrast.viewport[2] = width;
883         dpsoftrast.viewport[3] = height;
884         dpsoftrast.fb_viewportcenter[1] = dpsoftrast.viewport[0] + 0.5f * dpsoftrast.viewport[2] - 0.5f;
885         dpsoftrast.fb_viewportcenter[2] = dpsoftrast.fb_height - dpsoftrast.viewport[1] - 0.5f * dpsoftrast.viewport[3] - 0.5f;
886         dpsoftrast.fb_viewportcenter[3] = 0.5f;
887         dpsoftrast.fb_viewportcenter[0] = 0.0f;
888         dpsoftrast.fb_viewportscale[1] = 0.5f * dpsoftrast.viewport[2];
889         dpsoftrast.fb_viewportscale[2] = -0.5f * dpsoftrast.viewport[3];
890         dpsoftrast.fb_viewportscale[3] = 0.5f;
891         dpsoftrast.fb_viewportscale[0] = 1.0f;
892 }
893
894 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
895 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
896 {
897         int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
898         unsigned int *p;
899         unsigned int c;
900         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
901         x1 = thread->fb_clearscissor[0];
902         y1 = thread->fb_clearscissor[1];
903         x2 = thread->fb_clearscissor[2];
904         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
905         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
906         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
907         if(y1 < t1) y1 = t1;
908         if(y2 > t2) y2 = t2;
909         w = x2 - x1;
910         h = y2 - y1;
911         if (w < 1 || h < 1)
912                 return;
913         // FIXME: honor fb_colormask?
914         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
915         for (i = 0;i < 4;i++)
916         {
917                 if (!dpsoftrast.fb_colorpixels[i])
918                         continue;
919                 for (y = y1;y < y2;y++)
920                 {
921                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
922                         for (x = x1;x < x2;x++)
923                                 p[x] = c;
924                 }
925         }
926 }
927 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
928 {
929         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
930         command->opcode = DPSOFTRAST_OPCODE_ClearColor;
931         command->r = r;
932         command->g = g;
933         command->b = b;
934         command->a = a;
935 }
936
937 DEFCOMMAND(3, ClearDepth, float depth;)
938 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
939 {
940         int x1, y1, x2, y2, w, h, x, y, t1, t2;
941         unsigned int *p;
942         unsigned int c;
943         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
944         x1 = thread->fb_clearscissor[0];
945         y1 = thread->fb_clearscissor[1];
946         x2 = thread->fb_clearscissor[2];
947         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
948         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
949         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
950         if(y1 < t1) y1 = t1;
951         if(y2 > t2) y2 = t2;
952         w = x2 - x1;
953         h = y2 - y1;
954         if (w < 1 || h < 1)
955                 return;
956         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
957         for (y = y1;y < y2;y++)
958         {
959                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
960                 for (x = x1;x < x2;x++)
961                         p[x] = c;
962         }
963 }
964 void DPSOFTRAST_ClearDepth(float d)
965 {
966         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
967         command->opcode = DPSOFTRAST_OPCODE_ClearDepth;
968         command->depth = d;
969 }
970
971 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
972 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
973 {
974         thread->colormask[0] = command->r != 0;
975         thread->colormask[1] = command->g != 0;
976         thread->colormask[2] = command->b != 0;
977         thread->colormask[3] = command->a != 0;
978         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
979 }
980 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
981 {
982         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
983         command->opcode = DPSOFTRAST_OPCODE_ColorMask;
984         command->r = r;
985         command->g = g;
986         command->b = b;
987         command->a = a;
988 }
989
990 DEFCOMMAND(5, DepthTest, int enable;)
991 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
992 {
993         thread->depthtest = command->enable;
994         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
995 }
996 void DPSOFTRAST_DepthTest(int enable)
997 {
998         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
999         command->opcode = DPSOFTRAST_OPCODE_DepthTest;
1000         command->enable = enable;
1001 }
1002
1003 DEFCOMMAND(6, ScissorTest, int enable;)
1004 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1005 {
1006         thread->scissortest = command->enable;
1007         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1008 }
1009 void DPSOFTRAST_ScissorTest(int enable)
1010 {
1011         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1012         command->opcode = DPSOFTRAST_OPCODE_ScissorTest;
1013         command->enable = enable;
1014 }
1015
1016 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1017 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1018 {
1019         thread->scissor[0] = command->x;
1020         thread->scissor[1] = command->y;
1021         thread->scissor[2] = command->width;
1022         thread->scissor[3] = command->height;
1023         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1024 }
1025 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1026 {
1027         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1028         command->opcode = DPSOFTRAST_OPCODE_Scissor;
1029         command->x = x;
1030         command->y = y;
1031         command->width = width;
1032         command->height = height;
1033 }
1034
1035 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1036 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1037 {
1038         thread->blendfunc[0] = command->sfactor;
1039         thread->blendfunc[1] = command->dfactor;
1040         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1041 }
1042 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1043 {
1044         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1045         command->opcode = DPSOFTRAST_OPCODE_BlendFunc;
1046         command->sfactor = sfactor;
1047         command->dfactor = dfactor;
1048 }
1049
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1052 {
1053         thread->blendsubtract = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1055 }
1056 void DPSOFTRAST_BlendSubtract(int enable)
1057 {
1058         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059         command->opcode = DPSOFTRAST_OPCODE_BlendSubtract;
1060         command->enable = enable;
1061 }
1062
1063 DEFCOMMAND(10, DepthMask, int enable;)
1064 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1065 {
1066         thread->depthmask = command->enable;
1067 }
1068 void DPSOFTRAST_DepthMask(int enable)
1069 {
1070         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1071         command->opcode = DPSOFTRAST_OPCODE_DepthMask;
1072         command->enable = enable;
1073 }
1074
1075 DEFCOMMAND(11, DepthFunc, int func;)
1076 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1077 {
1078         thread->depthfunc = command->func;
1079 }
1080 void DPSOFTRAST_DepthFunc(int func)
1081 {
1082         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1083         command->opcode = DPSOFTRAST_OPCODE_DepthFunc;
1084         command->func = func;
1085 }
1086
1087 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1088 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1089 {
1090         thread->depthrange[0] = command->nearval;
1091         thread->depthrange[1] = command->farval;
1092 }
1093 void DPSOFTRAST_DepthRange(float nearval, float farval)
1094 {
1095         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1096         command->opcode = DPSOFTRAST_OPCODE_DepthRange;
1097         command->nearval = nearval;
1098         command->farval = farval;
1099 }
1100
1101 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1102 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1103 {
1104         thread->polygonoffset[0] = command->alongnormal;
1105         thread->polygonoffset[1] = command->intoview;
1106 }
1107 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1108 {
1109         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1110         command->opcode = DPSOFTRAST_OPCODE_PolygonOffset;
1111         command->alongnormal = alongnormal;
1112         command->intoview = intoview;
1113 }
1114
1115 void DPSOFTRAST_CullFace(int mode)
1116 {
1117         dpsoftrast.cullface = mode;
1118 }
1119
1120 DEFCOMMAND(15, AlphaTest, int enable;)
1121 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1122 {
1123         thread->alphatest = command->enable;
1124 }
1125 void DPSOFTRAST_AlphaTest(int enable)
1126 {
1127         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1128         command->opcode = DPSOFTRAST_OPCODE_AlphaTest;
1129         command->enable = enable;
1130 }
1131
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1134 {
1135         thread->alphafunc = command->func;
1136         thread->alphavalue = command->ref;
1137 }
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1139 {
1140         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141         command->opcode = DPSOFTRAST_OPCODE_AlphaFunc;
1142         command->func = func;
1143         command->ref = ref;
1144 }
1145
1146 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1147 {
1148         dpsoftrast.color[0] = r;
1149         dpsoftrast.color[1] = g;
1150         dpsoftrast.color[2] = b;
1151         dpsoftrast.color[3] = a;
1152 }
1153
1154 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1155 {
1156         int outstride = blockwidth * 4;
1157         int instride = dpsoftrast.fb_width * 4;
1158         int bx1 = blockx;
1159         int by1 = blocky;
1160         int bx2 = blockx + blockwidth;
1161         int by2 = blocky + blockheight;
1162         int bw;
1163         int bh;
1164         int x;
1165         int y;
1166         unsigned char *inpixels;
1167         unsigned char *b;
1168         unsigned char *o;
1169         DPSOFTRAST_Flush();
1170         if (bx1 < 0) bx1 = 0;
1171         if (by1 < 0) by1 = 0;
1172         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1173         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1174         bw = bx2 - bx1;
1175         bh = by2 - by1;
1176         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1177         if (dpsoftrast.bigendian)
1178         {
1179                 for (y = by1;y < by2;y++)
1180                 {
1181                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1182                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1183                         for (x = bx1;x < bx2;x++)
1184                         {
1185                                 o[0] = b[3];
1186                                 o[1] = b[2];
1187                                 o[2] = b[1];
1188                                 o[3] = b[0];
1189                                 o += 4;
1190                                 b += 4;
1191                         }
1192                 }
1193         }
1194         else
1195         {
1196                 for (y = by1;y < by2;y++)
1197                 {
1198                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1199                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1200                         memcpy(o, b, bw*4);
1201                 }
1202         }
1203
1204 }
1205 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 {
1207         int tx1 = tx;
1208         int ty1 = ty;
1209         int tx2 = tx + width;
1210         int ty2 = ty + height;
1211         int sx1 = sx;
1212         int sy1 = sy;
1213         int sx2 = sx + width;
1214         int sy2 = sy + height;
1215         int swidth;
1216         int sheight;
1217         int twidth;
1218         int theight;
1219         int sw;
1220         int sh;
1221         int tw;
1222         int th;
1223         int y;
1224         unsigned int *spixels;
1225         unsigned int *tpixels;
1226         DPSOFTRAST_Texture *texture;
1227         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1228         if (mip < 0 || mip >= texture->mipmaps) return;
1229         DPSOFTRAST_Flush();
1230         spixels = dpsoftrast.fb_colorpixels[0];
1231         swidth = dpsoftrast.fb_width;
1232         sheight = dpsoftrast.fb_height;
1233         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1234         twidth = texture->mipmap[mip][2];
1235         theight = texture->mipmap[mip][3];
1236         if (tx1 < 0) tx1 = 0;
1237         if (ty1 < 0) ty1 = 0;
1238         if (tx2 > twidth) tx2 = twidth;
1239         if (ty2 > theight) ty2 = theight;
1240         if (sx1 < 0) sx1 = 0;
1241         if (sy1 < 0) sy1 = 0;
1242         if (sx2 > swidth) sx2 = swidth;
1243         if (sy2 > sheight) sy2 = sheight;
1244         tw = tx2 - tx1;
1245         th = ty2 - ty1;
1246         sw = sx2 - sx1;
1247         sh = sy2 - sy1;
1248         if (tw > sw) tw = sw;
1249         if (th > sh) th = sh;
1250         if (tw < 1 || th < 1)
1251                 return;
1252         for (y = 0;y < th;y++)
1253                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1254         if (texture->mipmaps > 1)
1255                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1256 }
1257
1258 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1259 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1260 {
1261         thread->texbound[command->unitnum] = command->texture;
1262 }
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1264 {
1265         DPSOFTRAST_Command_SetTexture *command;
1266         DPSOFTRAST_Texture *texture;
1267         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1268         {
1269                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1270                 return;
1271         }
1272         texture = DPSOFTRAST_Texture_GetByIndex(index);
1273         if (index && !texture)
1274         {
1275                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1276                 return;
1277         }
1278
1279         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280         command->opcode = DPSOFTRAST_OPCODE_SetTexture;
1281         command->unitnum = unitnum;
1282         command->texture = texture;
1283
1284         dpsoftrast.texbound[unitnum] = texture;
1285 }
1286
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1288 {
1289         dpsoftrast.pointer_vertex3f = vertex3f;
1290         dpsoftrast.stride_vertex = stride;
1291 }
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1293 {
1294         dpsoftrast.pointer_color4f = color4f;
1295         dpsoftrast.pointer_color4ub = NULL;
1296         dpsoftrast.stride_color = stride;
1297 }
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1299 {
1300         dpsoftrast.pointer_color4f = NULL;
1301         dpsoftrast.pointer_color4ub = color4ub;
1302         dpsoftrast.stride_color = stride;
1303 }
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1305 {
1306         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308         dpsoftrast.stride_texcoord[unitnum] = stride;
1309 }
1310
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1313 {
1314         thread->shader_mode = command->mode;
1315         thread->shader_permutation = command->permutation;
1316 }
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1318 {
1319         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320         command->opcode = DPSOFTRAST_OPCODE_SetShader;
1321         command->mode = mode;
1322         command->permutation = permutation;
1323
1324         dpsoftrast.shader_mode = mode;
1325         dpsoftrast.shader_permutation = permutation;
1326 }
1327
1328 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1329 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1330 {
1331         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1332 }
1333 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1334 {
1335         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1336         command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1337         command->index = index;
1338         command->val[0] = v0;
1339         command->val[1] = v1;
1340         command->val[2] = v2;
1341         command->val[3] = v3;
1342
1343         dpsoftrast.uniform4f[index*4+0] = v0;
1344         dpsoftrast.uniform4f[index*4+1] = v1;
1345         dpsoftrast.uniform4f[index*4+2] = v2;
1346         dpsoftrast.uniform4f[index*4+3] = v3;
1347 }
1348 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1349 {
1350         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1351         command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1352         command->index = index;
1353         memcpy(command->val, v, sizeof(command->val));
1354
1355         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1356 }
1357
1358 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1359 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1360 {
1361         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1362 }
1363 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1364 {
1365 #ifdef SSE2_PRESENT
1366         int i, index;
1367         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1368         {
1369                 __m128 m0, m1, m2, m3;
1370                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1371                 command->opcode = DPSOFTRAST_OPCODE_UniformMatrix4f;
1372                 command->index = index;
1373                 if (((size_t)v)&(ALIGN_SIZE-1))
1374                 {
1375                         m0 = _mm_loadu_ps(v);
1376                         m1 = _mm_loadu_ps(v+4);
1377                         m2 = _mm_loadu_ps(v+8);
1378                         m3 = _mm_loadu_ps(v+12);
1379                 }
1380                 else
1381                 {
1382                         m0 = _mm_load_ps(v);
1383                         m1 = _mm_load_ps(v+4);
1384                         m2 = _mm_load_ps(v+8);
1385                         m3 = _mm_load_ps(v+12);
1386                 }
1387                 if (transpose)
1388                 {
1389                         __m128 t0, t1, t2, t3;
1390                         t0 = _mm_unpacklo_ps(m0, m1);
1391                         t1 = _mm_unpacklo_ps(m2, m3);
1392                         t2 = _mm_unpackhi_ps(m0, m1);
1393                         t3 = _mm_unpackhi_ps(m2, m3);
1394                         m0 = _mm_movelh_ps(t0, t1);
1395                         m1 = _mm_movehl_ps(t1, t0);
1396                         m2 = _mm_movelh_ps(t2, t3);
1397                         m3 = _mm_movehl_ps(t3, t2);                     
1398                 }
1399                 _mm_store_ps(command->val, m0);
1400                 _mm_store_ps(command->val+4, m1);
1401                 _mm_store_ps(command->val+8, m2);
1402                 _mm_store_ps(command->val+12, m3);
1403                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1404                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1405                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1406                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1407         }
1408 #endif
1409 }
1410
1411 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1412 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1413 {
1414         thread->uniform1i[command->index] = command->val;
1415 }
1416 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1417 {
1418         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1419         command->opcode = DPSOFTRAST_OPCODE_Uniform1i;
1420         command->index = index;
1421         command->val = i0;
1422
1423         dpsoftrast.uniform1i[command->index] = i0;
1424 }
1425
1426 #ifdef SSE2_PRESENT
1427 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1428 {
1429         float *end = dst + size*4;
1430         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1431         {
1432                 while (dst < end)
1433                 {
1434                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1435                         dst += 4;
1436                         src += stride;
1437                 }
1438         }
1439         else
1440         {
1441                 while (dst < end)
1442                 {
1443                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1444                         dst += 4;
1445                         src += stride;
1446                 }
1447         }
1448 }
1449
1450 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1451 {
1452         float *end = dst + size*4;
1453         if (stride == sizeof(float[3]))
1454         {
1455                 float *end4 = dst + (size&~3)*4;        
1456                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1457                 {
1458                         while (dst < end4)
1459                         {
1460                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1461                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1462                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1463                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1464                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1465                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1466                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1467                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1468                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1469                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1470                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1471                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1472                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473                                 dst += 16;
1474                                 src += 4*sizeof(float[3]);
1475                         }
1476                 }
1477                 else
1478                 {
1479                         while (dst < end4)
1480                         {
1481                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1482                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1483                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1484                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1486                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1487                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1488                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1489                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1490                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1491                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1492                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1493                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1494                                 dst += 16;
1495                                 src += 4*sizeof(float[3]);
1496                         }
1497                 }
1498         }
1499         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1500         {
1501                 while (dst < end)
1502                 {
1503                         __m128 v = _mm_loadu_ps((const float *)src);
1504                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1505                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1506                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1507                         _mm_store_ps(dst, v);
1508                         dst += 4;
1509                         src += stride;
1510                 }
1511         }
1512         else
1513         {
1514                 while (dst < end)
1515                 {
1516                         __m128 v = _mm_load_ps((const float *)src);
1517                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1518                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1519                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1520                         _mm_store_ps(dst, v);
1521                         dst += 4;
1522                         src += stride;
1523                 }
1524         }
1525 }
1526
1527 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1528 {
1529         float *end = dst + size*4;
1530         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1531         if (stride == sizeof(float[2]))
1532         {
1533                 float *end2 = dst + (size&~1)*4;
1534                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1535                 {
1536                         while (dst < end2)
1537                         {
1538                                 __m128 v = _mm_loadu_ps((const float *)src);
1539                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1540                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1541                                 dst += 8;
1542                                 src += 2*sizeof(float[2]);
1543                         }
1544                 }
1545                 else
1546                 {
1547                         while (dst < end2)
1548                         {
1549                                 __m128 v = _mm_load_ps((const float *)src);
1550                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1551                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1552                                 dst += 8;
1553                                 src += 2*sizeof(float[2]);
1554                         }
1555                 }
1556         }
1557         while (dst < end)
1558         {
1559                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1560                 dst += 4;
1561                 src += stride;
1562         }
1563 }
1564
1565 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1566 {
1567         float *end = dst + size*4;
1568         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1569         if (stride == sizeof(unsigned char[4]))
1570         {
1571                 float *end4 = dst + (size&~3)*4;
1572                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1573                 {
1574                         while (dst < end4)
1575                         {
1576                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1577                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1578                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1579                     _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1580                     _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1581                                 dst += 16;
1582                                 src += 4*sizeof(unsigned char[4]);
1583                         }
1584                 }
1585                 else
1586                 {
1587                 while (dst < end4)
1588                 {
1589                     __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1590                     _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1591                     _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1592                     _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1593                     _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1594                     dst += 16;
1595                     src += 4*sizeof(unsigned char[4]);
1596                 }
1597                 }
1598         }
1599         while (dst < end)
1600         {
1601                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1602                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1603                 dst += 4;
1604                 src += stride;
1605         }
1606 }
1607
// Writes the four floats at src into each of the `size` float[4] entries at dst.
// dst is written with aligned stores; src is read once with an unaligned load,
// even when size is zero.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int n;
	for (n = 0; n < size; n++)
		_mm_store_ps(dst + n*4, value);
}
1618 #endif
1619
1620 void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors)
1621 {
1622 #ifdef SSE2_PRESENT
1623         int i;
1624         int j;
1625         int stride;
1626         const float *v;
1627         float *p;
1628         float *data;
1629         const unsigned char *b;
1630         dpsoftrast.numvertices = numvertices;
1631         if (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1632         {
1633                 if (dpsoftrast.maxvertices < 4096)
1634                         dpsoftrast.maxvertices = 4096;
1635                 while (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1636                         dpsoftrast.maxvertices *= 2;
1637                 if (dpsoftrast.in_array4f[0])
1638                         MM_FREE(dpsoftrast.in_array4f[0]);
1639                 data = (float *)MM_CALLOC(1, dpsoftrast.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
1640                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1641                         dpsoftrast.in_array4f[i] = data;
1642                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1643                         dpsoftrast.post_array4f[i] = data;
1644                 dpsoftrast.screencoord4f = data;
1645                 data += dpsoftrast.maxvertices * 4;
1646         }
1647         stride = dpsoftrast.stride_vertex;
1648         v = (const float *)((unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride);
1649         p = dpsoftrast.in_array4f[0];
1650         DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1651         if (needcolors)
1652         {
1653                 if (dpsoftrast.pointer_color4f)
1654                 {
1655                         stride = dpsoftrast.stride_color;
1656                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride);
1657                         p = dpsoftrast.in_array4f[1];
1658                         DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1659                 }
1660                 else if (dpsoftrast.pointer_color4ub)
1661                 {
1662                         stride = dpsoftrast.stride_color;
1663                         b = (const unsigned char *)((const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride);
1664                         p = dpsoftrast.in_array4f[1];
1665                         DPSOFTRAST_Load4bTo4f(p, b, numvertices, stride);
1666                 }
1667                 else
1668                 {
1669                         p = dpsoftrast.in_array4f[1];
1670                         DPSOFTRAST_Fill4f(p, dpsoftrast.color, numvertices);
1671                 }
1672         }
1673         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL-2;j++)
1674         {
1675                 if (dpsoftrast.pointer_texcoordf[j])
1676                 {
1677                         stride = dpsoftrast.stride_texcoord[j];
1678                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_texcoordf[j] + firstvertex * stride);
1679                         p = dpsoftrast.in_array4f[j+2];
1680                         switch(dpsoftrast.components_texcoord[j])
1681                         {
1682                         case 2:
1683                                 DPSOFTRAST_Load2fTo4f(p, (const unsigned char *)v, numvertices, stride);
1684                                 break;
1685                         case 3:
1686                                 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1687                                 break;
1688                         case 4:
1689                                 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1690                                 break;
1691                         }
1692                 }
1693         }
1694 #endif
1695 }
1696
// Transforms numitems float[4] vectors from in4f to out4f by the 16-float matrix:
//   out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
// An exact identity matrix turns into a plain copy.  out4f is written with
// aligned stores; in4f and inmatrix16f may be unaligned.
// Does nothing without SSE2_PRESENT.
void DPSOFTRAST_Array_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw;
	float * const stop = out4f + numitems*4;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// bytewise identity: nothing to multiply
		memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
	{
		// aligned source
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 vin = _mm_load_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(1, 1, 1, 1)), my);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
	else
	{
		// unaligned source
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 vin = _mm_loadu_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(1, 1, 1, 1)), my);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
#endif
}
1743
// Copies numitems float[4] entries from in4f to out4f; the two ranges must not overlap.
void DPSOFTRAST_Array_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1748
1749 #ifdef SSE2_PRESENT
1750 static __m128 DPSOFTRAST_Draw_ProjectVertex(__m128 v)
1751 {
1752         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1753         __m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1754         v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1755         v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1756         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1757         return v;
1758 }
1759 #endif
1760
// Copies numitems float[4] vertices from in4f to out4f unchanged and writes the
// projected form of each one to screen4f.  For an input (x y z w) the screen
// entry is (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w), where c is
// dpsoftrast.fb_viewportcenter and s is dpsoftrast.fb_viewportscale.
// out4f and screen4f are written with aligned stores.  in4f may be unaligned,
// matching DPSOFTRAST_Array_Transform and DPSOFTRAST_Array_TransformProject
// (the latter forwards its in4f here for an identity matrix).
// Does nothing without SSE2_PRESENT.
void DPSOFTRAST_Array_Project(float *out4f, float *screen4f, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			_mm_store_ps(out4f, v);
			// (1 x y z), so that lane 0 yields the 1/w term
			v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
			_mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			_mm_store_ps(out4f, v);
			// (1 x y z), so that lane 0 yields the 1/w term
			v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
			_mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
#endif
}
1779
// Transforms numitems float[4] vertices from in4f by the 16-float matrix
// (out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]), stores the result
// in out4f, and stores its projected form in screen4f:
// (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w) with c and s taken from
// dpsoftrast.fb_viewportcenter and dpsoftrast.fb_viewportscale.
// An exact identity matrix is handed to DPSOFTRAST_Array_Project instead.
// Does nothing without SSE2_PRESENT.
void DPSOFTRAST_Array_TransformProject(float *out4f, float *screen4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw, center, scale;
	float * const stop = out4f + numitems*4;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// nothing to multiply, only the projection is needed
		DPSOFTRAST_Array_Project(out4f, screen4f, in4f, numitems);
		return;
	}
	center = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	scale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
	{
		// aligned source
		for (; out4f < stop; in4f += 4, out4f += 4, screen4f += 4)
		{
			__m128 vin = _mm_load_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(1, 1, 1, 1)), my);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			__m128 moved = _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw)));
			__m128 wwww, screen;
			_mm_store_ps(out4f, moved);
			wwww = _mm_shuffle_ps(moved, moved, _MM_SHUFFLE(3, 3, 3, 3));
			// (1 x y z), so that lane 0 yields the 1/w term
			screen = _mm_move_ss(_mm_shuffle_ps(moved, moved, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			screen = _mm_add_ps(center, _mm_div_ps(_mm_mul_ps(scale, screen), wwww));
			_mm_store_ps(screen4f, _mm_shuffle_ps(screen, screen, _MM_SHUFFLE(0, 3, 2, 1)));
		}
	}
	else
	{
		// unaligned source
		for (; out4f < stop; in4f += 4, out4f += 4, screen4f += 4)
		{
			__m128 vin = _mm_loadu_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(1, 1, 1, 1)), my);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			__m128 moved = _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw)));
			__m128 wwww, screen;
			_mm_store_ps(out4f, moved);
			wwww = _mm_shuffle_ps(moved, moved, _MM_SHUFFLE(3, 3, 3, 3));
			// (1 x y z), so that lane 0 yields the 1/w term
			screen = _mm_move_ss(_mm_shuffle_ps(moved, moved, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			screen = _mm_add_ps(center, _mm_div_ps(_mm_mul_ps(scale, screen), wwww));
			_mm_store_ps(screen4f, _mm_shuffle_ps(screen, screen, _MM_SHUFFLE(0, 3, 2, 1)));
		}
	}
#endif
}
1837
1838 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1839 {
1840         int x;
1841         int startx = span->startx;
1842         int endx = span->endx;
1843         float wslope = triangle->w[0];
1844         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1845         float endz = 1.0f / (w + wslope * startx);
1846         for (x = startx;x < endx;)
1847         {
1848                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1849                 float z = endz, dz;
1850                 if(nextsub >= endx) nextsub = endsub = endx-1;
1851                 endz = 1.0f / (w + wslope * nextsub);
1852                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1853                 for (; x <= endsub; x++, z += dz)
1854                         zf[x] = z;
1855         }
1856 }
1857
1858 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1859 {
1860         int x;
1861         int startx = span->startx;
1862         int endx = span->endx;
1863         int d[4];
1864         float a, b;
1865         unsigned char * RESTRICT pixelmask = span->pixelmask;
1866         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1867         if (!pixel)
1868                 return;
1869         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1870         // handle alphatest now (this affects depth writes too)
1871         if (thread->alphatest)
1872                 for (x = startx;x < endx;x++)
1873                         if (in4f[x*4+3] < 0.5f)
1874                                 pixelmask[x] = false;
1875         // FIXME: this does not handle bigendian
1876         switch(thread->fb_blendmode)
1877         {
1878         case DPSOFTRAST_BLENDMODE_OPAQUE:
1879                 for (x = startx;x < endx;x++)
1880                 {
1881                         if (!pixelmask[x])
1882                                 continue;
1883                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1884                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1885                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1886                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1887                         pixel[x*4+0] = d[0];
1888                         pixel[x*4+1] = d[1];
1889                         pixel[x*4+2] = d[2];
1890                         pixel[x*4+3] = d[3];
1891                 }
1892                 break;
1893         case DPSOFTRAST_BLENDMODE_ALPHA:
1894                 for (x = startx;x < endx;x++)
1895                 {
1896                         if (!pixelmask[x])
1897                                 continue;
1898                         a = in4f[x*4+3] * 255.0f;
1899                         b = 1.0f - in4f[x*4+3];
1900                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1901                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1902                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1903                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1904                         pixel[x*4+0] = d[0];
1905                         pixel[x*4+1] = d[1];
1906                         pixel[x*4+2] = d[2];
1907                         pixel[x*4+3] = d[3];
1908                 }
1909                 break;
1910         case DPSOFTRAST_BLENDMODE_ADDALPHA:
1911                 for (x = startx;x < endx;x++)
1912                 {
1913                         if (!pixelmask[x])
1914                                 continue;
1915                         a = in4f[x*4+3] * 255.0f;
1916                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1917                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1918                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1919                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1920                         pixel[x*4+0] = d[0];
1921                         pixel[x*4+1] = d[1];
1922                         pixel[x*4+2] = d[2];
1923                         pixel[x*4+3] = d[3];
1924                 }
1925                 break;
1926         case DPSOFTRAST_BLENDMODE_ADD:
1927                 for (x = startx;x < endx;x++)
1928                 {
1929                         if (!pixelmask[x])
1930                                 continue;
1931                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1932                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1933                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1934                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1935                         pixel[x*4+0] = d[0];
1936                         pixel[x*4+1] = d[1];
1937                         pixel[x*4+2] = d[2];
1938                         pixel[x*4+3] = d[3];
1939                 }
1940                 break;
1941         case DPSOFTRAST_BLENDMODE_INVMOD:
1942                 for (x = startx;x < endx;x++)
1943                 {
1944                         if (!pixelmask[x])
1945                                 continue;
1946                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1947                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1948                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1949                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1950                         pixel[x*4+0] = d[0];
1951                         pixel[x*4+1] = d[1];
1952                         pixel[x*4+2] = d[2];
1953                         pixel[x*4+3] = d[3];
1954                 }
1955                 break;
1956         case DPSOFTRAST_BLENDMODE_MUL:
1957                 for (x = startx;x < endx;x++)
1958                 {
1959                         if (!pixelmask[x])
1960                                 continue;
1961                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1962                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1963                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1964                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1965                         pixel[x*4+0] = d[0];
1966                         pixel[x*4+1] = d[1];
1967                         pixel[x*4+2] = d[2];
1968                         pixel[x*4+3] = d[3];
1969                 }
1970                 break;
1971         case DPSOFTRAST_BLENDMODE_MUL2:
1972                 for (x = startx;x < endx;x++)
1973                 {
1974                         if (!pixelmask[x])
1975                                 continue;
1976                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
1977                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
1978                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
1979                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
1980                         pixel[x*4+0] = d[0];
1981                         pixel[x*4+1] = d[1];
1982                         pixel[x*4+2] = d[2];
1983                         pixel[x*4+3] = d[3];
1984                 }
1985                 break;
1986         case DPSOFTRAST_BLENDMODE_SUBALPHA:
1987                 for (x = startx;x < endx;x++)
1988                 {
1989                         if (!pixelmask[x])
1990                                 continue;
1991                         a = in4f[x*4+3] * -255.0f;
1992                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
1993                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
1994                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
1995                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
1996                         pixel[x*4+0] = d[0];
1997                         pixel[x*4+1] = d[1];
1998                         pixel[x*4+2] = d[2];
1999                         pixel[x*4+3] = d[3];
2000                 }
2001                 break;
2002         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2003                 for (x = startx;x < endx;x++)
2004                 {
2005                         if (!pixelmask[x])
2006                                 continue;
2007                         a = 255.0f;
2008                         b = 1.0f - in4f[x*4+3];
2009                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013                         pixel[x*4+0] = d[0];
2014                         pixel[x*4+1] = d[1];
2015                         pixel[x*4+2] = d[2];
2016                         pixel[x*4+3] = d[3];
2017                 }
2018                 break;
2019         }
2020 }
2021
2022 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2023 {
2024 #ifdef SSE2_PRESENT
2025         int x;
2026         int startx = span->startx;
2027         int endx = span->endx;
2028         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2029         unsigned char * RESTRICT pixelmask = span->pixelmask;
2030         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2031         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2032         if (!pixel)
2033                 return;
2034         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2035         pixeli += span->y * dpsoftrast.fb_width + span->x;
2036         // handle alphatest now (this affects depth writes too)
2037         if (thread->alphatest)
2038                 for (x = startx;x < endx;x++)
2039                         if (in4ub[x*4+3] < 0.5f)
2040                                 pixelmask[x] = false;
2041         // FIXME: this does not handle bigendian
2042         switch(thread->fb_blendmode)
2043         {
2044         case DPSOFTRAST_BLENDMODE_OPAQUE:
2045                 for (x = startx;x + 4 <= endx;)
2046                 {
2047                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2048                         {
2049                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2050                                 x += 4;
2051                         }
2052                         else
2053                         {
2054                                 if (pixelmask[x])
2055                                         pixeli[x] = ini[x];
2056                                 x++;
2057                         }
2058                 }
2059                 for (;x < endx;x++)
2060                         if (pixelmask[x])
2061                                 pixeli[x] = ini[x];
2062                 break;
2063         case DPSOFTRAST_BLENDMODE_ALPHA:
2064         #define FINISHBLEND(blend2, blend1) \
2065                 for (x = startx;x + 2 <= endx;x += 2) \
2066                 { \
2067                         __m128i src, dst; \
2068                         switch (*(const unsigned short*)&pixelmask[x]) \
2069                         { \
2070                         case 0x0101: \
2071                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2072                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2073                                 blend2; \
2074                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2075                                 continue; \
2076                         case 0x0100: \
2077                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2078                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2079                                 blend1; \
2080                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2081                                 continue; \
2082                         case 0x0001: \
2083                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2084                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2085                                 blend1; \
2086                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2087                                 continue; \
2088                         } \
2089                         break; \
2090                 } \
2091                 for(;x < endx; x++) \
2092                 { \
2093                         __m128i src, dst; \
2094                         if (!pixelmask[x]) \
2095                                 continue; \
2096                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2097                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2098                         blend1; \
2099                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2100                 }
2101
2102                 FINISHBLEND({
2103                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2104                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2105                 }, {
2106                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2107                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2108                 });
2109                 break;
2110         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2111                 FINISHBLEND({
2112                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2113                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2114                 }, {
2115                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2116                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2117                 });
2118                 break;
2119         case DPSOFTRAST_BLENDMODE_ADD:
2120                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2121                 break;
2122         case DPSOFTRAST_BLENDMODE_INVMOD:
2123                 FINISHBLEND({
2124                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2125                 }, {
2126                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2127                 });
2128                 break;
2129         case DPSOFTRAST_BLENDMODE_MUL:
2130                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2131                 break;
2132         case DPSOFTRAST_BLENDMODE_MUL2:
2133                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2134                 break;
2135         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2136                 FINISHBLEND({
2137                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2138                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2139                 }, {
2140                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2141                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2142                 });
2143                 break;
2144         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2145                 FINISHBLEND({
2146                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2147                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2148                 }, {
2149                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2150                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2151                 });
2152                 break;
2153         }
2154 #endif
2155 }
2156
2157 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2158 {
2159         int x;
2160         int startx = span->startx;
2161         int endx = span->endx;
2162         int flags;
2163         float c[4];
2164         float data[4];
2165         float slope[4];
2166         float tc[2], endtc[2];
2167         float tcscale[2];
2168         unsigned int tci[2];
2169         unsigned int tci1[2];
2170         unsigned int tcimin[2];
2171         unsigned int tcimax[2];
2172         int tciwrapmask[2];
2173         int tciwidth;
2174         int filter;
2175         int mip;
2176         const unsigned char * RESTRICT pixelbase;
2177         const unsigned char * RESTRICT pixel[4];
2178         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2179         // if no texture is bound, just fill it with white
2180         if (!texture)
2181         {
2182                 for (x = startx;x < endx;x++)
2183                 {
2184                         out4f[x*4+0] = 1.0f;
2185                         out4f[x*4+1] = 1.0f;
2186                         out4f[x*4+2] = 1.0f;
2187                         out4f[x*4+3] = 1.0f;
2188                 }
2189                 return;
2190         }
2191         mip = triangle->mip[texunitindex];
2192         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2193         // if this mipmap of the texture is 1 pixel, just fill it with that color
2194         if (texture->mipmap[mip][1] == 4)
2195         {
2196                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2197                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2198                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2199                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2200                 for (x = startx;x < endx;x++)
2201                 {
2202                         out4f[x*4+0] = c[0];
2203                         out4f[x*4+1] = c[1];
2204                         out4f[x*4+2] = c[2];
2205                         out4f[x*4+3] = c[3];
2206                 }
2207                 return;
2208         }
2209         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2210         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2211         flags = texture->flags;
2212         tcscale[0] = texture->mipmap[mip][2];
2213         tcscale[1] = texture->mipmap[mip][3];
2214         tciwidth = texture->mipmap[mip][2];
2215         tcimin[0] = 0;
2216         tcimin[1] = 0;
2217         tcimax[0] = texture->mipmap[mip][2]-1;
2218         tcimax[1] = texture->mipmap[mip][3]-1;
2219         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2220         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2221         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2222         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2223         for (x = startx;x < endx;)
2224         {
2225                 unsigned int subtc[2];
2226                 unsigned int substep[2];
2227                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2228                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2229                 if(nextsub >= endx)
2230                 {
2231                         nextsub = endsub = endx-1;      
2232                         if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2233                 }
2234                 tc[0] = endtc[0];
2235                 tc[1] = endtc[1];
2236                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2237                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2238                 substep[0] = (endtc[0] - tc[0]) * subscale;
2239                 substep[1] = (endtc[1] - tc[1]) * subscale;
2240                 subtc[0] = tc[0] * (1<<16);
2241                 subtc[1] = tc[1] * (1<<16);
2242                 if(filter)
2243                 {
2244                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2245                         {
2246                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2247                                 {
2248                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2249                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2250                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2251                                         tci[0] = subtc[0]>>16;
2252                                         tci[1] = subtc[1]>>16;
2253                                         tci1[0] = tci[0] + 1;
2254                                         tci1[1] = tci[1] + 1;
2255                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2256                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2257                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2258                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2259                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2260                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2261                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2262                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2263                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2264                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2265                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2266                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2267                                         out4f[x*4+0] = c[0];
2268                                         out4f[x*4+1] = c[1];
2269                                         out4f[x*4+2] = c[2];
2270                                         out4f[x*4+3] = c[3];
2271                                 }
2272                         }
2273                         else
2274                         {
2275                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2276                                 {
2277                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2278                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2279                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2280                                         tci[0] = subtc[0]>>16;
2281                                         tci[1] = subtc[1]>>16;
2282                                         tci1[0] = tci[0] + 1;
2283                                         tci1[1] = tci[1] + 1;
2284                                         tci[0] &= tciwrapmask[0];
2285                                         tci[1] &= tciwrapmask[1];
2286                                         tci1[0] &= tciwrapmask[0];
2287                                         tci1[1] &= tciwrapmask[1];
2288                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2289                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2290                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2291                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2292                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2293                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2294                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2295                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2296                                         out4f[x*4+0] = c[0];
2297                                         out4f[x*4+1] = c[1];
2298                                         out4f[x*4+2] = c[2];
2299                                         out4f[x*4+3] = c[3];
2300                                 }
2301                         }
2302                 }
2303                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2304                 {
2305                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2306                         {
2307                                 tci[0] = subtc[0]>>16;
2308                                 tci[1] = subtc[1]>>16;
2309                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2310                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2311                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2312                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2313                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2314                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2315                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2316                                 out4f[x*4+0] = c[0];
2317                                 out4f[x*4+1] = c[1];
2318                                 out4f[x*4+2] = c[2];
2319                                 out4f[x*4+3] = c[3];
2320                         }
2321                 }
2322                 else
2323                 {
2324                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2325                         {
2326                                 tci[0] = subtc[0]>>16;
2327                                 tci[1] = subtc[1]>>16;
2328                                 tci[0] &= tciwrapmask[0];
2329                                 tci[1] &= tciwrapmask[1];
2330                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2331                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2332                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2333                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2334                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2335                                 out4f[x*4+0] = c[0];
2336                                 out4f[x*4+1] = c[1];
2337                                 out4f[x*4+2] = c[2];
2338                                 out4f[x*4+3] = c[3];
2339                         }
2340                 }
2341         }
2342 }
2343
2344 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2345 {
2346 #ifdef SSE2_PRESENT
2347         int x;
2348         int startx = span->startx;
2349         int endx = span->endx;
2350         int flags;
2351         __m128 data, slope, tcscale;
2352         __m128i tcsize, tcmask, tcoffset, tcmax;
2353         __m128 tc, endtc;
2354         __m128i subtc, substep, endsubtc;
2355         int filter;
2356         int mip;
2357         unsigned int *outi = (unsigned int *)out4ub;
2358         const unsigned char * RESTRICT pixelbase;
2359         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2360         // if no texture is bound, just fill it with white
2361         if (!texture)
2362         {
2363                 memset(out4ub + startx*4, 255, span->length*4);
2364                 return;
2365         }
2366         mip = triangle->mip[texunitindex];
2367         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2368         // if this mipmap of the texture is 1 pixel, just fill it with that color
2369         if (texture->mipmap[mip][1] == 4)
2370         {
2371                 unsigned int k = *((const unsigned int *)pixelbase);
2372                 for (x = startx;x < endx;x++)
2373                         outi[x] = k;
2374                 return;
2375         }
2376         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2377         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2378         flags = texture->flags;
2379         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2380         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2381         tcscale = _mm_cvtepi32_ps(tcsize);
2382         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2383         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2384         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2385         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2386         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2387         tcmax = _mm_packs_epi32(tcmask, tcmask);
2388         for (x = startx;x < endx;)
2389         {
2390                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2391                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2392                 if(nextsub >= endx)
2393                 {
2394                         nextsub = endsub = endx-1;
2395                         if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2396                 }       
2397                 tc = endtc;
2398                 subtc = endsubtc;
2399                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2400                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2401                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2402                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2403                 substep = _mm_slli_epi32(substep, 1);
2404                 if (filter)
2405                 {
2406                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2407                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2408                         {
2409                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2410                                 {
2411                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2412                                         tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0x10000)), tcoffset);
2413                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2414                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2415                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), _mm_setzero_si128());
2416                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))]), _mm_setzero_si128());
2417                                         fracm = _mm_srli_epi16(subtc, 1);
2418                                         pix1 = _mm_add_epi16(pix1,
2419                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2420                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2421                                         pix3 = _mm_add_epi16(pix3,
2422                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2423                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2424                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2425                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2426                                         pix2 = _mm_add_epi16(pix2,
2427                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2428                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2429                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2430                                 }
2431                                 if (x <= endsub)
2432                                 {
2433                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2434                                         tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0)), tcoffset);
2435                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2436                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2437                                         fracm = _mm_srli_epi16(subtc, 1);
2438                                         pix1 = _mm_add_epi16(pix1,
2439                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2440                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2441                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2442                                         pix1 = _mm_add_epi16(pix1,
2443                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2444                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2445                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2446                                         x++;
2447                                 }
2448                         }
2449                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2450                         {
2451                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2452                                 {
2453                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2454                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2455                                         tci = _mm_madd_epi16(tci, tcoffset);
2456                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2457                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2458                                                                                         _mm_setzero_si128());
2459                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2460                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2461                                                                                         _mm_setzero_si128());
2462                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2463                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2464                                         tci = _mm_madd_epi16(tci, tcoffset);
2465                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2466                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2467                                                                                         _mm_setzero_si128());
2468                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2469                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2470                                                                                         _mm_setzero_si128());
2471                                         fracm = _mm_srli_epi16(subtc, 1);
2472                                         pix1 = _mm_add_epi16(pix1,
2473                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2474                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2475                                         pix3 = _mm_add_epi16(pix3,
2476                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2477                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2478                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2479                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2480                                         pix2 = _mm_add_epi16(pix2,
2481                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2482                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2483                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2484                                 }
2485                                 if (x <= endsub)
2486                                 {
2487                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2488                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2489                                         tci = _mm_madd_epi16(tci, tcoffset);
2490                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2491                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2492                                                                                         _mm_setzero_si128());
2493                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2494                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2495                                                                                         _mm_setzero_si128());
2496                                         fracm = _mm_srli_epi16(subtc, 1);
2497                                         pix1 = _mm_add_epi16(pix1,
2498                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2499                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2500                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2501                                         pix1 = _mm_add_epi16(pix1,
2502                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2503                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2504                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2505                                         x++;
2506                                 }
2507                         }
2508                         else
2509                         {
2510                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2511                                 {
2512                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2513                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2514                                         tci = _mm_madd_epi16(tci, tcoffset);
2515                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2516                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2517                                                                                         _mm_setzero_si128());
2518                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2519                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2520                                                                                         _mm_setzero_si128());
2521                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2522                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2523                                         tci = _mm_madd_epi16(tci, tcoffset);
2524                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2525                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2526                                                                                         _mm_setzero_si128());
2527                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2528                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2529                                                                                         _mm_setzero_si128());
2530                                         fracm = _mm_srli_epi16(subtc, 1);
2531                                         pix1 = _mm_add_epi16(pix1,
2532                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2533                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2534                                         pix3 = _mm_add_epi16(pix3,
2535                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2536                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2537                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2538                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2539                                         pix2 = _mm_add_epi16(pix2,
2540                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2541                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2542                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2543                                 }
2544                                 if (x <= endsub)
2545                                 {
2546                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2547                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2548                                         tci = _mm_madd_epi16(tci, tcoffset);
2549                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2550                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2551                                                                                         _mm_setzero_si128());
2552                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2553                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2554                                                                                         _mm_setzero_si128());
2555                                         fracm = _mm_srli_epi16(subtc, 1);
2556                                         pix1 = _mm_add_epi16(pix1,
2557                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2558                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2559                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2560                                         pix1 = _mm_add_epi16(pix1,
2561                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2562                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2563                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2564                                         x++;
2565                                 }
2566                         }
2567                 }
2568                 else
2569                 {
2570                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2571                         {
2572                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2573                                 {
2574                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2575                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2576                                         tci = _mm_madd_epi16(tci, tcoffset);
2577                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2578                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2579                                 }
2580                                 if (x <= endsub)
2581                                 {
2582                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2583                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2584                                         tci = _mm_madd_epi16(tci, tcoffset);
2585                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2586                                         x++;
2587                                 }
2588                         }
2589                         else
2590                         {
2591                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2592                                 {
2593                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2594                                         tci = _mm_and_si128(tci, tcmax); 
2595                                         tci = _mm_madd_epi16(tci, tcoffset);
2596                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2597                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2598                                 }
2599                                 if (x <= endsub)
2600                                 {
2601                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2602                                         tci = _mm_and_si128(tci, tcmax); 
2603                                         tci = _mm_madd_epi16(tci, tcoffset);
2604                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2605                                         x++;
2606                                 }
2607                         }
2608                 }
2609         }
2610 #endif
2611 }
2612
2613 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2614 {
2615         // TODO: IMPLEMENT
2616         memset(out4ub, 255, span->length*4);
2617 }
2618
// Shadowmap lookup (not implemented yet).
// The lookup vector is ignored and the constant 1.0 is returned for every input.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2624
2625 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2626 {
2627         int x;
2628         int startx = span->startx;
2629         int endx = span->endx;
2630         float c[4];
2631         float data[4];
2632         float slope[4];
2633         float z;
2634         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2635         for (x = startx;x < endx;x++)
2636         {
2637                 z = zf[x];
2638                 c[0] = (data[0] + slope[0]*x) * z;
2639                 c[1] = (data[1] + slope[1]*x) * z;
2640                 c[2] = (data[2] + slope[2]*x) * z;
2641                 c[3] = (data[3] + slope[3]*x) * z;
2642                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2643                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2644                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2645                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2646         }
2647 }
2648
2649 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2650 {
2651         int x;
2652         int startx = span->startx;
2653         int endx = span->endx;
2654         float c[4];
2655         float data[4];
2656         float slope[4];
2657         float z;
2658         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2659         for (x = startx;x < endx;x++)
2660         {
2661                 z = zf[x];
2662                 c[0] = (data[0] + slope[0]*x) * z;
2663                 c[1] = (data[1] + slope[1]*x) * z;
2664                 c[2] = (data[2] + slope[2]*x) * z;
2665                 c[3] = (data[3] + slope[3]*x) * z;
2666                 out4f[x*4+0] = c[0];
2667                 out4f[x*4+1] = c[1];
2668                 out4f[x*4+2] = c[2];
2669                 out4f[x*4+3] = c[3];
2670         }
2671 }
2672
2673 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2674 {
2675         int x, startx = span->startx, endx = span->endx;
2676         float c[4], localcolor[4];
2677         localcolor[0] = subcolor[0];
2678         localcolor[1] = subcolor[1];
2679         localcolor[2] = subcolor[2];
2680         localcolor[3] = subcolor[3];
2681         for (x = startx;x < endx;x++)
2682         {
2683                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2684                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2685                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2686                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2687                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2688                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2689                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2690                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2691         }
2692 }
2693
2694 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2695 {
2696         int x, startx = span->startx, endx = span->endx;
2697         for (x = startx;x < endx;x++)
2698         {
2699                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2700                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2701                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2702                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2703         }
2704 }
2705
2706 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2707 {
2708         int x, startx = span->startx, endx = span->endx;
2709         for (x = startx;x < endx;x++)
2710         {
2711                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2712                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2713                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2714                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2715         }
2716 }
2717
2718 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2719 {
2720         int x, startx = span->startx, endx = span->endx;
2721         float a, b;
2722         for (x = startx;x < endx;x++)
2723         {
2724                 a = 1.0f - inb4f[x*4+3];
2725                 b = inb4f[x*4+3];
2726                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2727                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2728                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2729                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2730         }
2731 }
2732
2733 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2734 {
2735         int x, startx = span->startx, endx = span->endx;
2736         float localcolor[4], ilerp, lerp;
2737         localcolor[0] = color[0];
2738         localcolor[1] = color[1];
2739         localcolor[2] = color[2];
2740         localcolor[3] = color[3];
2741         ilerp = 1.0f - localcolor[3];
2742         lerp = localcolor[3];
2743         for (x = startx;x < endx;x++)
2744         {
2745                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2746                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2747                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2748                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2749         }
2750 }
2751
2752
2753
2754 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2755 {
2756 #ifdef SSE2_PRESENT
2757         int x;
2758         int startx = span->startx;
2759         int endx = span->endx;
2760         __m128 data, slope;
2761         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2762         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2763         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2764         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2765         data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
2766         slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
2767         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2768         {
2769                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2770                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
2771                 data = _mm_add_ps(data, slope);
2772                 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2773                 mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
2774                 pix = _mm_mulhi_epu16(pix, mod);
2775                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2776         }
2777         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2778         {
2779                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2780                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2781                 mod = _mm_packs_epi32(mod, mod);
2782                 pix = _mm_mulhi_epu16(pix, mod);
2783                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2784         }
2785 #endif
2786 }
2787
2788 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2789 {
2790 #ifdef SSE2_PRESENT
2791         int x;
2792         int startx = span->startx;
2793         int endx = span->endx;
2794         __m128 data, slope;
2795         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2796         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2797         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2798         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2799         data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
2800         slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
2801         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2802         {
2803                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
2804                 data = _mm_add_ps(data, slope);
2805                 pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2806                 pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
2807                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2808         }
2809         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2810         {
2811                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2812                 pix = _mm_packs_epi32(pix, pix);
2813                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2814         }
2815 #endif
2816 }
2817
2818 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2819 {
2820 #ifdef SSE2_PRESENT
2821         int x, startx = span->startx, endx = span->endx;
2822         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2823         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2824         for (x = startx;x+2 <= endx;x+=2)
2825         {
2826                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2827                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2828                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2829                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2830         }
2831         if(x < endx)
2832         {
2833                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2834                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2835                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2836                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2837         }
2838 #endif
2839 }
2840
2841 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2842 {
2843 #ifdef SSE2_PRESENT
2844         int x, startx = span->startx, endx = span->endx;
2845         for (x = startx;x+2 <= endx;x+=2)
2846         {
2847                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2848                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2849                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2850                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2851         }
2852         if(x < endx)
2853         {
2854                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2855                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2856                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2857                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2858         }
2859 #endif
2860 }
2861
2862 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2863 {
2864 #ifdef SSE2_PRESENT
2865         int x, startx = span->startx, endx = span->endx;
2866         for (x = startx;x+2 <= endx;x+=2)
2867         {
2868                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2869                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2870                 pix1 = _mm_add_epi16(pix1, pix2);
2871                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2872         }
2873         if(x < endx)
2874         {
2875                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2876                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2877                 pix1 = _mm_add_epi16(pix1, pix2);
2878                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2879         }
2880 #endif
2881 }
2882
2883 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2884 {
2885 #ifdef SSE2_PRESENT
2886         int x, startx = span->startx, endx = span->endx;
2887         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
2888         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
2889         for (x = startx;x+2 <= endx;x+=2)
2890         {
2891                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2892                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2893                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2894                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2895         }
2896         if(x < endx)
2897         {
2898                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2899                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2900                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2901                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2902         }
2903 #endif
2904 }
2905
2906 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2907 {
2908 #ifdef SSE2_PRESENT
2909         int x, startx = span->startx, endx = span->endx;
2910         for (x = startx;x+2 <= endx;x+=2)
2911         {
2912                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2913                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2914                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2915                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2916                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2917         }
2918         if(x < endx)
2919         {
2920                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2921                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2922                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
2923                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2924                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2925         }
2926 #endif
2927 }
2928
2929 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
2930 {
2931 #ifdef SSE2_PRESENT
2932         int x, startx = span->startx, endx = span->endx;
2933         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
2934         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2935         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
2936         for (x = startx;x+2 <= endx;x+=2)
2937         {
2938                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
2939                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2940                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2941         }
2942         if(x < endx)
2943         {
2944                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
2945                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2946                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2947         }
2948 #endif
2949 }
2950
2951
2952
2953 void DPSOFTRAST_VertexShader_Generic(void)
2954 {
2955         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2956         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
2957         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
2958         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
2959                 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
2960 }
2961
2962 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
2963 {
2964         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2965         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2966         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2967         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2968         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
2969         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
2970         {
2971                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
2972                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
2973                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
2974                 {
2975                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
2976                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2977                         {
2978                                 // multiply
2979                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2980                         }
2981                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2982                         {
2983                                 // add
2984                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2985                         }
2986                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
2987                         {
2988                                 // alphablend
2989                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2990                         }
2991                 }
2992         }
2993         else
2994                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
2995         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
2996 }
2997
2998
2999
3000 void DPSOFTRAST_VertexShader_PostProcess(void)
3001 {
3002         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3003         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
3004         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
3005 }
3006
3007 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3008 {
3009         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3010         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3011         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3012         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3013         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3014         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3015         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3016         {
3017                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3018                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3019         }
3020         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3021         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3022         {
3023                 // TODO: implement saturation
3024         }
3025         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3026         {
3027                 // TODO: implement gammaramps
3028         }
3029         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3030 }
3031
3032
3033
3034 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3035 {
3036         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3037 }
3038
3039 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3040 {
3041         // this is never called (because colormask is off when this shader is used)
3042         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3043         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3044         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3045         memset(buffer_FragColorbgra8, 0, span->length*4);
3046         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3047 }
3048
3049
3050
3051 void DPSOFTRAST_VertexShader_FlatColor(void)
3052 {
3053         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3054         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3055 }
3056
3057 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3058 {
3059         int x, startx = span->startx, endx = span->endx;
3060         int Color_Ambienti[4];
3061         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3062         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3063         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3064         Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3065         Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3066         Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3067         Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
3068         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3069         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3070         for (x = startx;x < endx;x++)
3071         {
3072                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3073                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3074                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3075                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3076         }
3077         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3078 }
3079
3080
3081
3082 void DPSOFTRAST_VertexShader_VertexColor(void)
3083 {
3084         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3085         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
3086         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3087 }
3088
3089 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3090 {
3091 #ifdef SSE2_PRESENT
3092         unsigned char * RESTRICT pixelmask = span->pixelmask;
3093         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3094         int x, startx = span->startx, endx = span->endx;
3095         __m128i Color_Ambientm, Color_Diffusem;
3096         __m128 data, slope;
3097         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3098         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3099         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3100         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3101         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3102         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3103         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3104                 pixel = buffer_FragColorbgra8;
3105         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3106         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3107         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3108         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3109         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3110         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3111         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3112         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3113         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3114         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3115         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3116         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3117         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3118         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3119         {
3120                 __m128i color, mod, pix;
3121                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3122                 {
3123                         __m128i pix2, mod2;
3124                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3125                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3126                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3127                         data = _mm_add_ps(data, slope);
3128                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3129                         data = _mm_add_ps(data, slope);
3130                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3131                         data = _mm_add_ps(data, slope);
3132                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3133                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3134                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3135                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3136                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3137                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3138                         x += 3;
3139                         continue;
3140                 }
3141                 if(!pixelmask[x])
3142                         continue;
3143                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3144                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3145                 mod = _mm_packs_epi32(mod, mod);
3146                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3147                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3148         }
3149         if(pixel == buffer_FragColorbgra8)
3150                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3151 #endif
3152 }
3153
3154
3155
3156 void DPSOFTRAST_VertexShader_Lightmap(void)
3157 {
3158         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3159         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3160         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
3161 }
3162
3163 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3164 {
3165 #ifdef SSE2_PRESENT
3166         unsigned char * RESTRICT pixelmask = span->pixelmask;
3167         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3168         int x, startx = span->startx, endx = span->endx;
3169         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3170         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3171         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3172         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3173         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3174         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3175         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3176         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3177         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3178         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3179                 pixel = buffer_FragColorbgra8;
3180         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3181         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3182         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3183         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3184         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3185         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3186         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3187         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3188         {
3189                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3190                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3191                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3192                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3193                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3194                 for (x = startx;x < endx;x++)
3195                 {
3196                         __m128i color, lightmap, glow, pix;
3197                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3198                         {
3199                                 __m128i pix2;
3200                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3201                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3202                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3203                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3204                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3205                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3206                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3207                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3208                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3209                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3210                                 x += 3;
3211                                 continue;
3212                         }
3213                         if(!pixelmask[x])
3214                                 continue;
3215                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3216                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3217                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3218                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3219                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3220                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3221                 }
3222         }
3223         else
3224         {
3225                 for (x = startx;x < endx;x++)
3226                 {
3227                         __m128i color, lightmap, pix;
3228                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3229                         {
3230                                 __m128i pix2;
3231                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3232                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3233                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3234                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3235                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3236                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3237                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3238                                 x += 3;
3239                                 continue;
3240                         }
3241                         if(!pixelmask[x]) 
3242                                 continue;
3243                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3244                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3245                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3246                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3247                 }
3248         }
3249         if(pixel == buffer_FragColorbgra8)
3250                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3251 #endif
3252 }
3253
3254
3255
3256 void DPSOFTRAST_VertexShader_FakeLight(void)
3257 {
3258         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3259 }
3260
3261 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3262 {
3263         // TODO: IMPLEMENT
3264         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3265         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3266         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3267         memset(buffer_FragColorbgra8, 0, span->length*4);
3268         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3269 }
3270
3271
3272
// Vertex stage for model-space light direction maps: currently identical to
// the plain lightmap vertex stage, to which it forwards.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3277
3278 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3279 {
3280         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3281         // TODO: IMPLEMENT
3282 }
3283
3284
3285
// Vertex stage for tangent-space light direction maps: currently identical
// to the plain lightmap vertex stage, to which it forwards.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3290
3291 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3292 {
3293         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3294         // TODO: IMPLEMENT
3295 }
3296
3297
3298
3299 void DPSOFTRAST_VertexShader_LightDirection(void)
3300 {
3301         int i;
3302         int numvertices = dpsoftrast.numvertices;
3303         float LightDir[4];
3304         float LightVector[4];
3305         float EyePosition[4];
3306         float EyeVectorModelSpace[4];
3307         float EyeVector[4];
3308         float position[4];
3309         float svector[4];
3310         float tvector[4];
3311         float normal[4];
3312         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3313         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3314         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3315         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3316         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3317         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3318         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3319         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3320         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3321         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3322         for (i = 0;i < numvertices;i++)
3323         {
3324                 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3325                 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3326                 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3327                 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3328                 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3329                 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3330                 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3331                 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3332                 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3333                 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3334                 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3335                 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3336                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3337                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3338                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3339                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3340                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3341                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3342                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3343                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3344                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3345                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3346                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3347                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3348                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3349                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3350                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3351                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3352                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3353         }
3354 }
3355
// Small scalar / 3-component vector helpers.
// All of these are macros and evaluate their arguments more than once, so do
// not pass expressions with side effects.
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales the first three components of v to unit length in place; a vector
// of length zero is left untouched.  The argument is parenthesized at every
// use so pointer expressions such as (base + 4) expand correctly, and the
// temporary has a name unlikely to collide with a caller's variable.
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_normalize_scale_ = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_normalize_scale_)\
	{\
		dpsoftrast_normalize_scale_ = 1.0f / dpsoftrast_normalize_scale_;\
		(v)[0] *= dpsoftrast_normalize_scale_;\
		(v)[1] *= dpsoftrast_normalize_scale_;\
		(v)[2] *= dpsoftrast_normalize_scale_;\
	}\
}\
while(0)
3374
3375 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3376 {
3377         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3378         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3381         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3382         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3383         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3384         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3385         int x, startx = span->startx, endx = span->endx;
3386         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3387         float LightVectordata[4];
3388         float LightVectorslope[4];
3389         float EyeVectordata[4];
3390         float EyeVectorslope[4];
3391         float z;
3392         float diffusetex[4];
3393         float glosstex[4];
3394         float surfacenormal[4];
3395         float lightnormal[4];
3396         float eyenormal[4];
3397         float specularnormal[4];
3398         float diffuse;
3399         float specular;
3400         float SpecularPower;
3401         int d[4];
3402         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3403         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3404         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3405         Color_Glow[3] = 0.0f;
3406         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3407         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3408         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3409         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3410         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3411         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3412         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3413         Color_Pants[3] = 0.0f;
3414         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3415         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3416         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3417         Color_Shirt[3] = 0.0f;
3418         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3419         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3420         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3421         {
3422                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3423                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3424         }
3425         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3426         {
3427                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3428         }
3429         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3430         {
3431                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3432                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3433                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3434                 Color_Diffuse[3] = 0.0f;
3435                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3436                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3437                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3438                 LightColor[3] = 0.0f;
3439                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3440                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3441                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3442                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3443                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3444                 Color_Specular[3] = 0.0f;
3445                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3446                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3447                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3448                 for (x = startx;x < endx;x++)
3449                 {
3450                         z = buffer_z[x];
3451                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3452                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3453                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3454                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3455                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3456                         {
3457                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3458                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3459                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3460                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3461                         }
3462                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3463                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3464                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3465                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3466                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3467                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3468                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3469                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3470
3471                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3472                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3473                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3474                         DPSOFTRAST_Vector3Normalize(lightnormal);
3475
3476                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3477                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3478                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3479                         DPSOFTRAST_Vector3Normalize(eyenormal);
3480
3481                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3482                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3483                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3484                         DPSOFTRAST_Vector3Normalize(specularnormal);
3485
3486                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3487                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3488                         specular = pow(specular, SpecularPower * glosstex[3]);
3489                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3490                         {
3491                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3492                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3493                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3494                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3495                         }
3496                         else
3497                         {
3498                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3499                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3500                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3501                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3502                         }
3503                         buffer_FragColorbgra8[x*4+0] = d[0];
3504                         buffer_FragColorbgra8[x*4+1] = d[1];
3505                         buffer_FragColorbgra8[x*4+2] = d[2];
3506                         buffer_FragColorbgra8[x*4+3] = d[3];
3507                 }
3508         }
3509         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3510         {
3511                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3512                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3513                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3514                 Color_Diffuse[3] = 0.0f;
3515                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3516                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3517                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3518                 LightColor[3] = 0.0f;
3519                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3520                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3521                 for (x = startx;x < endx;x++)
3522                 {
3523                         z = buffer_z[x];
3524                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3525                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3526                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3527                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3528                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3529                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3530                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3531                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3532
3533                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3534                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3535                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3536                         DPSOFTRAST_Vector3Normalize(lightnormal);
3537
3538                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3539                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3540                         {
3541                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3542                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3543                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3544                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3545                         }
3546                         else
3547                         {
3548                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3549                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3550                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3551                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3552                         }
3553                         buffer_FragColorbgra8[x*4+0] = d[0];
3554                         buffer_FragColorbgra8[x*4+1] = d[1];
3555                         buffer_FragColorbgra8[x*4+2] = d[2];
3556                         buffer_FragColorbgra8[x*4+3] = d[3];
3557                 }
3558         }
3559         else
3560         {
3561                 for (x = startx;x < endx;x++)
3562                 {
3563                         z = buffer_z[x];
3564                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3565                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3566                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3567                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3568
3569                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3570                         {
3571                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3572                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3573                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3574                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3575                         }
3576                         else
3577                         {
3578                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3579                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3580                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3581                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3582                         }
3583                         buffer_FragColorbgra8[x*4+0] = d[0];
3584                         buffer_FragColorbgra8[x*4+1] = d[1];
3585                         buffer_FragColorbgra8[x*4+2] = d[2];
3586                         buffer_FragColorbgra8[x*4+3] = d[3];
3587                 }
3588         }
3589         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3590 }
3591
3592
3593
3594 void DPSOFTRAST_VertexShader_LightSource(void)
3595 {
3596         int i;
3597         int numvertices = dpsoftrast.numvertices;
3598         float LightPosition[4];
3599         float LightVector[4];
3600         float LightVectorModelSpace[4];
3601         float EyePosition[4];
3602         float EyeVectorModelSpace[4];
3603         float EyeVector[4];
3604         float position[4];
3605         float svector[4];
3606         float tvector[4];
3607         float normal[4];
3608         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3609         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3610         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3611         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3612         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3613         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3614         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3615         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3616         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3617         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3618         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3619         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
3620         for (i = 0;i < numvertices;i++)
3621         {
3622                 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3623                 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3624                 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3625                 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3626                 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3627                 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3628                 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3629                 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3630                 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3631                 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3632                 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3633                 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3634                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3635                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3636                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3637                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3638                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3639                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3640                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3641                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3642                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3643                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3644                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3645                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3646                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3647                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3648                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3649                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3650                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3651                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3652                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3653                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3654         }
3655 }
3656
3657 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3658 {
3659 #ifdef SSE2_PRESENT
3660         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3661         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668         int x, startx = span->startx, endx = span->endx;
3669         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3670         float CubeVectordata[4];
3671         float CubeVectorslope[4];
3672         float LightVectordata[4];
3673         float LightVectorslope[4];
3674         float EyeVectordata[4];
3675         float EyeVectorslope[4];
3676         float z;
3677         float diffusetex[4];
3678         float glosstex[4];
3679         float surfacenormal[4];
3680         float lightnormal[4];
3681         float eyenormal[4];
3682         float specularnormal[4];
3683         float diffuse;
3684         float specular;
3685         float SpecularPower;
3686         float CubeVector[4];
3687         float attenuation;
3688         int d[4];
3689         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3690         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3691         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3692         Color_Glow[3] = 0.0f;
3693         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3694         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3695         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3696         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3697         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3698         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3699         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3700         Color_Diffuse[3] = 0.0f;
3701         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3702         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3703         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3704         Color_Specular[3] = 0.0f;
3705         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3706         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3707         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3708         Color_Pants[3] = 0.0f;
3709         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3710         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3711         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3712         Color_Shirt[3] = 0.0f;
3713         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3714         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3715         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3716         LightColor[3] = 0.0f;
3717         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3718         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3719         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3720         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3721         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3722         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3723         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3725         {
3726                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3727                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3728         }
3729         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3730                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3731         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3732         {
3733                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3734                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3735                 for (x = startx;x < endx;x++)
3736                 {
3737                         z = buffer_z[x];
3738                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3739                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3740                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3741                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3742                         if (attenuation < 0.01f)
3743                                 continue;
3744                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3745                         {
3746                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3747                                 if (attenuation < 0.01f)
3748                                         continue;
3749                         }
3750
3751                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3752                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3753                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3754                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3755                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3756                         {
3757                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3758                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3759                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3760                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3761                         }
3762                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3763                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3764                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3765                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3766                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3767                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3768                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3769                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3770
3771                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3772                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3773                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3774                         DPSOFTRAST_Vector3Normalize(lightnormal);
3775
3776                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3777                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3778                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3779                         DPSOFTRAST_Vector3Normalize(eyenormal);
3780
3781                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3782                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3783                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3784                         DPSOFTRAST_Vector3Normalize(specularnormal);
3785
3786                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3787                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3788                         specular = pow(specular, SpecularPower * glosstex[3]);
3789                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3790                         {
3791                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3792                                 attenuation *= (1.0f / 255.0f);
3793                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3794                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3795                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3796                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3797                         }
3798                         else
3799                         {
3800                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3801                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3802                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3803                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3804                         }
3805                         buffer_FragColorbgra8[x*4+0] = d[0];
3806                         buffer_FragColorbgra8[x*4+1] = d[1];
3807                         buffer_FragColorbgra8[x*4+2] = d[2];
3808                         buffer_FragColorbgra8[x*4+3] = d[3];
3809                 }
3810         }
3811         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3812         {
3813                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3814                 for (x = startx;x < endx;x++)
3815                 {
3816                         z = buffer_z[x];
3817                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3818                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3819                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3820                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3821                         if (attenuation < 0.01f)
3822                                 continue;
3823                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3824                         {
3825                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3826                                 if (attenuation < 0.01f)
3827                                         continue;
3828                         }
3829
3830                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3831                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3832                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3833                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3834                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3835                         {
3836                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3837                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3838                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3839                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3840                         }
3841                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3842                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3843                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3844                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3845
3846                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3847                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3848                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3849                         DPSOFTRAST_Vector3Normalize(lightnormal);
3850
3851                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3852                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3853                         {
3854                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3855                                 attenuation *= (1.0f / 255.0f);
3856                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3857                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3858                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3859                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
3860                         }
3861                         else
3862                         {
3863                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3864                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3865                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3866                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3867                         }
3868                         buffer_FragColorbgra8[x*4+0] = d[0];
3869                         buffer_FragColorbgra8[x*4+1] = d[1];
3870                         buffer_FragColorbgra8[x*4+2] = d[2];
3871                         buffer_FragColorbgra8[x*4+3] = d[3];
3872                 }
3873         }
3874         else
3875         {
3876                 for (x = startx;x < endx;x++)
3877                 {
3878                         z = buffer_z[x];
3879                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3880                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3881                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3882                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3883                         if (attenuation < 0.01f)
3884                                 continue;
3885                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3886                         {
3887                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3888                                 if (attenuation < 0.01f)
3889                                         continue;
3890                         }
3891
3892                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3893                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3894                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3895                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3896                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3897                         {
3898                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3899                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3900                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3901                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3902                         }
3903                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3904                         {
3905                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3906                                 attenuation *= (1.0f / 255.0f);
3907                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3908                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3909                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3910                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
3911                         }
3912                         else
3913                         {
3914                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3915                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3916                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3917                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3918                         }
3919                         buffer_FragColorbgra8[x*4+0] = d[0];
3920                         buffer_FragColorbgra8[x*4+1] = d[1];
3921                         buffer_FragColorbgra8[x*4+2] = d[2];
3922                         buffer_FragColorbgra8[x*4+3] = d[3];
3923                 }
3924         }
3925         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3926 #endif
3927 }
3928
3929
3930
3931 void DPSOFTRAST_VertexShader_Refraction(void)
3932 {
3933         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3934 }
3935
3936 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3937 {
3938         // TODO: IMPLEMENT
3939         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3940         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3941         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3942         memset(buffer_FragColorbgra8, 0, span->length*4);
3943         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3944 }
3945
3946
3947
3948 void DPSOFTRAST_VertexShader_Water(void)
3949 {
3950         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3951 }
3952
3953
3954 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3955 {
3956         // TODO: IMPLEMENT
3957         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3958         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3959         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3960         memset(buffer_FragColorbgra8, 0, span->length*4);
3961         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3962 }
3963
3964
3965
3966 void DPSOFTRAST_VertexShader_ShowDepth(void)
3967 {
3968         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3969 }
3970
3971 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3972 {
3973         // TODO: IMPLEMENT
3974         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3975         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3976         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3977         memset(buffer_FragColorbgra8, 0, span->length*4);
3978         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3979 }
3980
3981
3982
3983 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
3984 {
3985         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3986 }
3987
3988 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3989 {
3990         // TODO: IMPLEMENT
3991         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3992         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3993         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3994         memset(buffer_FragColorbgra8, 0, span->length*4);
3995         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3996 }
3997
3998
3999
4000 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4001 {
4002         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4003 }
4004
4005 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4006 {
4007         // TODO: IMPLEMENT
4008         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4009         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4010         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4011         memset(buffer_FragColorbgra8, 0, span->length*4);
4012         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4013 }
4014
4015
4016
4017 typedef struct DPSOFTRAST_ShaderModeInfo_s
4018 {
4019         int lodarrayindex;
4020         void (*Vertex)(void);
4021         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4022         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4023         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4024 }
4025 DPSOFTRAST_ShaderModeInfo;
4026
4027 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4028 {
4029         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4030         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4031         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4032         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4033         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4034         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4035         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4036         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4037         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4038         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4039         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4040         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4041         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4042         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4043         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4044         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4045 };
4046
4047
4048 int DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int commandoffset, int endoffset)
4049 {
4050         while (commandoffset != endoffset)
4051         {
4052                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4053                 switch (command->opcode)
4054                 {
4055 #define INTERPCOMMAND(name) \
4056                 case DPSOFTRAST_OPCODE_##name : \
4057                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4058                         commandoffset += sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)); \
4059                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4060                                 commandoffset = 0; \
4061                         break;
4062                 INTERPCOMMAND(Viewport)
4063                 INTERPCOMMAND(ClearColor)
4064                 INTERPCOMMAND(ClearDepth)
4065                 INTERPCOMMAND(ColorMask)
4066                 INTERPCOMMAND(DepthTest)
4067                 INTERPCOMMAND(ScissorTest)
4068                 INTERPCOMMAND(Scissor)
4069                 INTERPCOMMAND(BlendFunc)
4070                 INTERPCOMMAND(BlendSubtract)
4071                 INTERPCOMMAND(DepthMask)
4072                 INTERPCOMMAND(DepthFunc)
4073                 INTERPCOMMAND(DepthRange)
4074                 INTERPCOMMAND(PolygonOffset)
4075                 INTERPCOMMAND(AlphaTest)
4076                 INTERPCOMMAND(AlphaFunc)
4077                 INTERPCOMMAND(SetTexture)
4078                 INTERPCOMMAND(SetShader)
4079                 INTERPCOMMAND(Uniform4f)
4080                 INTERPCOMMAND(UniformMatrix4f)
4081                 INTERPCOMMAND(Uniform1i)
4082
4083                 case DPSOFTRAST_OPCODE_Reset:
4084                         commandoffset = 0;
4085                         break;
4086                 }
4087         }
4088         return commandoffset;
4089 }
4090                                         
4091 int DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread, int commandoffset)
4092 {
4093         int i;
4094         int x;
4095         int startx;
4096         int endx;
4097 //      unsigned int c;
4098 //      unsigned int *colorpixel;
4099         unsigned int *depthpixel;
4100         float w;
4101         float wslope;
4102         int depth;
4103         int depthslope;
4104         unsigned int d;
4105         DPSOFTRAST_State_Triangle *triangle;
4106         DPSOFTRAST_State_Span *span;
4107         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4108         for (i = 0; i < thread->numspans; i++)
4109         {
4110                 span = &thread->spans[i];
4111                 triangle = &dpsoftrast.trianglepool.triangles[span->triangle];
4112                 if (commandoffset != triangle->commandoffset)
4113                 {
4114                         commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4115                         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4116                 }
4117                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4118                 {
4119                         wslope = triangle->w[0];
4120                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4121                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4122                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4123                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4124                         switch(thread->fb_depthfunc)
4125                         {
4126                         default:
4127                         case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4128                         case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4129                         case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4130                         case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4131                         case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4132                         case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4133                         case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4134                         }
4135                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4136                         //for (x = 0;x < span->length;x++)
4137                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4138                         // if there is no color buffer, skip pixel shader
4139                         startx = 0;
4140                         endx = span->length;
4141                         while (startx < endx && !pixelmask[startx])
4142                                 startx++;
4143                         while (endx > startx && !pixelmask[endx-1])
4144                                 endx--;
4145                         if (startx >= endx)
4146                                 continue; // no pixels to fill
4147                         span->pixelmask = pixelmask;
4148                         span->startx = startx;
4149                         span->endx = endx;
4150                         // run pixel shader if appropriate
4151                         // do this before running depthmask code, to allow the pixelshader
4152                         // to clear pixelmask values for alpha testing
4153                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4154                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4155                         if (thread->depthmask)
4156                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4157                                         if (pixelmask[x])
4158                                                 depthpixel[x] = d;
4159                 }
4160                 else
4161                 {
4162                         // no depth testing means we're just dealing with color...
4163                         // if there is no color buffer, skip pixel shader
4164                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4165                         {
4166                                 memset(pixelmask, 1, span->length);
4167                                 span->pixelmask = pixelmask;
4168                                 span->startx = 0;
4169                                 span->endx = span->length;
4170                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4171                         }
4172                 }
4173         }
4174         thread->numspans = 0;
4175         return commandoffset;
4176 }
4177
4178 void DPSOFTRAST_Draw_GenerateSpans(DPSOFTRAST_State_Thread *thread, int freetriangle)
4179 {
4180 #ifdef SSE2_PRESENT
4181         int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4182         int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4183         int commandoffset = thread->commandoffset;
4184         int triangleoffset = thread->triangleoffset;
4185         DPSOFTRAST_State_Triangle *triangle = NULL;
4186         int starty;
4187         int endy;
4188         int y;
4189         int numpoints;
4190         __m128 coords[4];
4191         __m128i ycoords;
4192         while (triangleoffset != freetriangle)
4193         {
4194                 triangle = &dpsoftrast.trianglepool.triangles[triangleoffset];
4195                 if (++triangleoffset >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL)
4196                         triangleoffset = 0;
4197                 starty = triangle->starty + 1;
4198                 endy = triangle->endy;
4199                 if (starty >= maxy || endy <= miny)
4200                         continue;
4201                 numpoints = triangle->numpoints;
4202                 coords[0] = _mm_load_ps(triangle->coords[0]);
4203                 coords[1] = _mm_load_ps(triangle->coords[1]);
4204                 coords[2] = _mm_load_ps(triangle->coords[2]);
4205                 coords[3] = _mm_load_ps(triangle->coords[3]);
4206                 ycoords = _mm_load_si128((const __m128i *)triangle->ycoords);
4207                 if (starty < miny)
4208                         starty = miny;
4209                 if (endy > maxy)
4210                         endy = maxy;
4211                 for (y = starty; y < endy;)
4212                 {
4213                         __m128 xcoords, xslope;
4214                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), ycoords);
4215                         int yccmask = _mm_movemask_epi8(ycc);
4216                         int edge0p, edge0n, edge1p, edge1n;
4217                         int nexty;
4218                         if (numpoints == 4)
4219                         {
4220                                 switch(yccmask)
4221                                 {
4222                                 default:
4223                                 case 0xFFFF: /*0000*/ y = endy; continue;
4224                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4225                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4226                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4227                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4228                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4229                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4230                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4231                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4232                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4233                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4234                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4235                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4236                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4237                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4238                                 case 0x0000: /*1111*/ y++; continue;
4239                                 }
4240                         }
4241                         else
4242                         {
4243                                 switch(yccmask)
4244                                 {
4245                                 default:
4246                                 case 0xFFFF: /*000*/ y = endy; continue;
4247                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4248                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4249                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4250                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4251                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4252                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4253                                 case 0x0000: /*111*/ y++; continue;
4254                                 }
4255                         }
4256                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), ycoords);
4257                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4258                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4259                         nexty = _mm_extract_epi16(ycc, 0);
4260                         if(nexty >= endy) nexty = endy-1;
4261                         if (_mm_ucomigt_ss(_mm_max_ss(coords[edge0n], coords[edge0p]), _mm_min_ss(coords[edge1n], coords[edge1p])))
4262                         {
4263                                 int tmp = edge0n;
4264                                 edge0n = edge1n;
4265                                 edge1n = tmp;
4266                                 tmp = edge0p;
4267                                 edge0p = edge1p;
4268                                 edge1p = tmp;
4269                         }
4270                         xslope = _mm_sub_ps(_mm_movelh_ps(coords[edge0n], coords[edge1n]), _mm_movelh_ps(coords[edge0p], coords[edge1p]));
4271                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4272                         xcoords = _mm_add_ps(_mm_movelh_ps(coords[edge0p], coords[edge1p]),
4273                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(coords[edge0p], coords[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4274                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4275                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4276                         {
4277                                 int startx, endx, offset;
4278                                 startx = _mm_cvtss_si32(xcoords);
4279                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4280                                 if (startx < 0) startx = 0;
4281                                 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4282                                 if (startx >= endx) continue;
4283                                 for (offset = startx; offset < endx;)
4284                                 {
4285                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4286                                         span->triangle = (int)(triangle - dpsoftrast.trianglepool.triangles);
4287                                         span->x = offset;
4288                                         span->y = y;
4289                                         span->length = endx - offset;
4290                                         if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4291                                                 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4292                                         offset += span->length;
4293                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4294                                                 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
4295                                 }
4296                         }
4297                 }
4298         }
4299
4300         if (thread->numspans > 0)
4301                 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
4302         if (commandoffset != triangle->commandoffset)
4303         {
4304                 commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4305                 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4306         }
4307         
4308         MEMORY_BARRIER;
4309
4310         thread->commandoffset = commandoffset;
4311         thread->triangleoffset = triangleoffset;
4312 #endif
4313 }
4314
4315 void DPSOFTRAST_Draw_FlushThreads(void)
4316 {
4317         DPSOFTRAST_State_Thread *thread;
4318         int i;
4319         if(dpsoftrast.drawtriangle != dpsoftrast.trianglepool.freetriangle)
4320         {
4321                 MEMORY_BARRIER;
4322                 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4323         }
4324 #ifdef USE_THREADS
4325         SDL_LockMutex(dpsoftrast.trianglemutex);
4326 #endif
4327         for (i = 0; i < dpsoftrast.numthreads; i++)
4328         {
4329                 thread = &dpsoftrast.threads[i];
4330 #ifdef USE_THREADS
4331                 while (thread->triangleoffset != dpsoftrast.drawtriangle)
4332                 {
4333                         thread->waiting = true;
4334                         SDL_CondBroadcast(dpsoftrast.trianglecond);
4335                         SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
4336                         thread->waiting = false;
4337                 }
4338 #else
4339                 if (thread->triangleoffset != dpsoftrast.drawtriangle) 
4340                         DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle);
4341 #endif
4342         }
4343 #ifdef USE_THREADS
4344         SDL_UnlockMutex(dpsoftrast.trianglemutex);
4345 #endif
4346         dpsoftrast.trianglepool.usedtriangles = 0;
4347         dpsoftrast.commandpool.usedcommands = 0;
4348 }
4349
#ifdef USE_THREADS
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
// Rasterizes queued triangles for as long as thread->index stays >= 0 and
// sleeps on dpsoftrast.trianglecond whenever it has caught up with
// dpsoftrast.drawtriangle.  Always returns 0.
static int DPSOFTRAST_Draw_Thread(void *data)
{
	DPSOFTRAST_State_Thread *self = (DPSOFTRAST_State_Thread *)data;
	for (;;)
	{
		if (self->index < 0)
			break;
		if (self->triangleoffset != dpsoftrast.drawtriangle)
		{
			DPSOFTRAST_Draw_GenerateSpans(self, dpsoftrast.drawtriangle);
			continue;
		}
		// looks idle: re-check under the mutex before going to sleep
		SDL_LockMutex(dpsoftrast.trianglemutex);
		if (self->triangleoffset == dpsoftrast.drawtriangle)
		{
			// tell a flusher blocked on this thread that it has caught up
			if (self->waiting)
				SDL_CondSignal(self->waitcond);
			SDL_CondWait(dpsoftrast.trianglecond, dpsoftrast.trianglemutex);
		}
		SDL_UnlockMutex(dpsoftrast.trianglemutex);
	}
	return 0;
}
#endif
4376
// Sets up every triangle of a draw call for rasterization and queues it in
// dpsoftrast.trianglepool.  For each triangle this fetches the three vertex
// indices, clips against the near plane (distance = z + w of the
// post-transform position), culls by winding according to dpsoftrast.cullface,
// rejects triangles whose clamped screen bounds are empty, and then stores the
// screen coordinates, the gradient planes of w and of every enabled vertex
// array, and a mip level per texture unit.  dpsoftrast.drawtriangle is advanced
// (after MEMORY_BARRIER) every DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES queued
// triangles and once more at the end of the call.
//   firstvertex  - subtracted from each element index to get an array index
//   numtriangles - number of triangles to process
//   element3i    - optional int index list (takes precedence when non-NULL)
//   element3s    - optional unsigned short index list; when both lists are
//                  NULL the vertices are consumed sequentially
//   arraymask    - arraymask[j] nonzero means array j gets interpolants
//   numarrays    - number of arraymask entries to consider
// The body is only compiled when SSE2_PRESENT is defined.
4377 void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask, int numarrays)
4378 {
4379 #ifdef SSE2_PRESENT
4380         int cullface = dpsoftrast.cullface;
4381         int width = dpsoftrast.fb_width;
4382         int height = dpsoftrast.fb_height;
             // (width-1, height-1) repeated: upper clamp for packed 16 bit screen coordinates
4383         __m128i fbmax = _mm_sub_epi16(_mm_setr_epi16(width, height, width, height, width, height, width, height), _mm_set1_epi16(1));
4384         DPSOFTRAST_State_Triangle *triangle;
             // triangles queued since drawtriangle was last advanced
4385         int numqueued = 0;
4386         int i;
4387         int j;
4388         int k;
4389         int y;
4390         int e[3];
4391         __m128i screeny;
4392         int starty, endy;
4393         int numpoints;
4394         int clipcase;
4395         float clipdist[4];
4396         __m128 triangleedge1, triangleedge2, trianglenormal;
4397         __m128 clipfrac[3];
4398         __m128 screen[4];
4399         DPSOFTRAST_Texture *texture;
4400         screen[3] = _mm_setzero_ps();
4401         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps(); 
4402         for (i = 0;i < numtriangles;i++)
4403         {
4404                 // fetch the three vertex indices of this triangle
4405                 // (element lists are rebased by firstvertex, otherwise vertices are used in order)
4406                 if (element3i)
4407                 {
4408                         e[0] = element3i[i*3+0] - firstvertex;
4409                         e[1] = element3i[i*3+1] - firstvertex;
4410                         e[2] = element3i[i*3+2] - firstvertex;
4411                 }
4412                 else if (element3s)
4413                 {
4414                         e[0] = element3s[i*3+0] - firstvertex;
4415                         e[1] = element3s[i*3+1] - firstvertex;
4416                         e[2] = element3s[i*3+2] - firstvertex;
4417                 }
4418                 else
4419                 {
4420                         e[0] = i*3+0;
4421                         e[1] = i*3+1;
4422                         e[2] = i*3+2;
4423                 }
4424
                     // SKIPBACKFACE: computes the two screen space edges and the scalar
                     // component of their cross product, then skips to the next triangle
                     // (continue) when its sign does not match the cullface setting
4425 #define SKIPBACKFACE \
4426                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4427                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4428                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4429                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4430                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4431                 switch(cullface) \
4432                 { \
4433                 case GL_BACK: \
4434                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4435                                 continue; \
4436                         break; \
4437                 case GL_FRONT: \
4438                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4439                                 continue; \
4440                         break; \
4441                 }
4442                         //trianglenormal = _mm_sub_ps(_mm_mul_ps(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))),
4443                         //                                                _mm_mul_ps(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1]));
4444                         //trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
4445                         //trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
4446                         //trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
4447
4448                         // macros for clipping vertices
                             // CLIPPEDVERTEXLERP(k,p1,p2): screen[k] = projection of the point where edge
                             // p1->p2 crosses the near plane; the blend factor is kept in clipfrac[p1]
                             // CLIPPEDVERTEXCOPY(k,p1): screen[k] = already projected vertex p1
4449
4450 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4451                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4452                         { \
4453                                 __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p2]*4]); \
4454                                 screen[k] = DPSOFTRAST_Draw_ProjectVertex(_mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1]))); \
4455                         }
4456 #define CLIPPEDVERTEXCOPY(k,p1) \
4457                         screen[k] = _mm_load_ps(&dpsoftrast.screencoord4f[e[p1]*4]);
4458
                     // GENATTRIBS(j, a0, a1, a2): values of array j at the first three points of
                     // the (possibly clipped) polygon, copied or interpolated with the clipfrac
                     // values recorded above according to clipcase
4459 #define GENATTRIBCOPY(j, attrib, p1) \
4460                 attrib = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]);
4461 #define GENATTRIBLERP(j, attrib, p1, p2) \
4462                 { \
4463                         __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p2]*4]); \
4464                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4465                 }
4466 #define GENATTRIBS(j, attrib0, attrib1, attrib2) \
4467                 switch(clipcase) \
4468                 { \
4469                 default: \
4470                 case 0: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4471                 case 1: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4472                 case 2: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4473                 case 3: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4474                 case 4: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4475                 case 5: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4476                 case 6: GENATTRIBLERP(j, attrib0, 1, 2); GENATTRIBCOPY(j, attrib1, 2); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4477                 }
4478
4479                 // calculate distance from nearplane
4480                 clipdist[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+3];
4481                 clipdist[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+3];
4482                 clipdist[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+3];
                     // classify which vertices are in front (clipdist >= 0): two vertices in front
                     // with one behind yields a quad (numpoints 4), otherwise a triangle; clipcase
                     // records the layout for GENATTRIBS
4483                 if (clipdist[0] >= 0.0f)
4484                 {
4485                         if (clipdist[1] >= 0.0f)
4486                         {
4487                                 if (clipdist[2] >= 0.0f)
4488                                 {
4489                                         // triangle is entirely in front of nearplane
4490                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4491                                         SKIPBACKFACE;
4492                                         numpoints = 3;
4493                                         clipcase = 0;
4494                                 }
4495                                 else
4496                                 {
4497                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4498                                         SKIPBACKFACE;
4499                                         numpoints = 4;
4500                                         clipcase = 1;
4501                                 }
4502                         }
4503                         else 
4504                         {
4505                                 if (clipdist[2] >= 0.0f)
4506                                 {
4507                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2);     CLIPPEDVERTEXCOPY(3,2);
4508                                         SKIPBACKFACE;
4509                                         numpoints = 4;
4510                                         clipcase = 2;
4511                                 }
4512                                 else
4513                                 {
4514                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4515                                         SKIPBACKFACE;
4516                                         numpoints = 3;
4517                                         clipcase = 3;
4518                                 }
4519                         }
4520                 }                       
4521                 else if (clipdist[1] >= 0.0f)
4522                 {
4523                         if (clipdist[2] >= 0.0f)
4524                         {
4525                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4526                                 SKIPBACKFACE;
4527                                 numpoints = 4;
4528                                 clipcase = 4;
4529                         }
4530                         else
4531                         {
4532                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4533                                 SKIPBACKFACE;
4534                                 numpoints = 3;
4535                                 clipcase = 5;
4536                         }
4537                 }
4538                 else if (clipdist[2] >= 0.0f)
4539                 {
4540                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4541                         SKIPBACKFACE;
4542                         numpoints = 3;
4543                         clipcase = 6;
4544                 }
4545                 else continue; // triangle is entirely behind nearplane
4546
4547                 {
4548                         // calculate integer y coords for triangle points
                             // screeni packs x,y of all four points as signed 16 bit values
                             // (point 2 is duplicated when there are only three points)
4549                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4550                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)), 
4551                                         screenmin = _mm_min_epi16(screeni, screenir), 
4552                                         screenmax = _mm_max_epi16(screeni, screenir);
4553                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4554                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
                             // clamp the bounding box to the framebuffer
4555                         screenmin = _mm_max_epi16(screenmin, _mm_setzero_si128());
4556                         screenmax = _mm_min_epi16(screenmax, fbmax);
4557                         // skip offscreen triangles
4558                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4559                                 continue;
4560                         starty = _mm_extract_epi16(screenmin, 1);
4561                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // keep only the y halves, sign extended to one 32 bit int per point
4562                         screeny = _mm_srai_epi32(screeni, 16);
4563                 }
4564
                     // make room when the triangle pool is (almost) full
4565                 if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
4566 #ifdef USE_THREADS
4567                         DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
4568 #else
4569                         DPSOFTRAST_Draw_FlushThreads();
4570 #endif
4571
                     // fill in the next free pool entry
4572                 triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
4573                 triangle->commandoffset = dpsoftrast.commandpool.freecommand;
4574                 triangle->starty = starty;
4575                 triangle->endy = endy;
4576                 triangle->numpoints = numpoints;
4577                 _mm_store_ps(triangle->coords[0], screen[0]);
4578                 _mm_store_ps(triangle->coords[1], screen[1]);
4579                 _mm_store_ps(triangle->coords[2], screen[2]);
4580                 _mm_store_ps(triangle->coords[3], numpoints > 3 ? screen[3] : screen[2]);
4581                 _mm_store_si128((__m128i *)triangle->ycoords, screeny);
4582
4583                 // calculate attribute plans for triangle data...
4584                 // okay, this triangle is going to produce spans, we'd better project
4585                 // the interpolants now (this is what gives perspective texturing),
4586                 // this consists of simply multiplying all arrays by the W coord
4587                 // (which is basically 1/Z), which will be undone per-pixel
4588                 // (multiplying by Z again) to get the perspective-correct array
4589                 // values
4590                 {
4591                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
                             // x,y of both screen edges divided by the scalar cross product from SKIPBACKFACE
4592                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4593                 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4594                 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4595                 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4596                 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4597                 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4598                 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4599                 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself: per-x slope, per-y slope and origin go into triangle->w[0..2]
4600                         attribedge1 = _mm_sub_ss(w0, w1);
4601                         attribedge2 = _mm_sub_ss(w2, w1);
4602                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4603                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4604                 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4605                 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4606                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4607                         _mm_store_ss(&triangle->w[0], attribxslope);
4608                         _mm_store_ss(&triangle->w[1], attribyslope);
4609                         _mm_store_ss(&triangle->w[2], attriborigin);
                             // same plane setup for every enabled array, with each value premultiplied by w
4610                         for (j = 0;j < numarrays;j++)
4611                         {
4612                                 if (arraymask[j])
4613                                 {
4614                                         __m128 attrib0, attrib1, attrib2;
4615                                         GENATTRIBS(j, attrib0, attrib1, attrib2);
4616                                         attriborigin = _mm_mul_ps(attrib1, w1);
4617                                         attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4618                                         attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4619                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4620                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4621                                         attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4622                                         _mm_stream_ps(triangle->attribs[j][0], attribxslope);
4623                                         _mm_stream_ps(triangle->attribs[j][1], attribyslope);
4624                                 _mm_stream_ps(triangle->attribs[j][2], attriborigin);
4625                         }
4626                     }
4627             }
4628
4629                 // adjust texture LOD by texture density, in the simplest way possible...
4630                 {
4631                         __m128 mipedgescale, mipedgetc, mipdensity, attrib0, attrib1, attrib2;
4632                         memset(triangle->mip, 0, sizeof(triangle->mip));
                             // approximate reciprocal screen space length of both edges
4633                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4634                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4635                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
                             // change of the shader's LOD array along both edges, per unit of screen length
4636                         k = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].lodarrayindex;
4637                         GENATTRIBS(k, attrib0, attrib1, attrib2);
4638                         mipedgetc = _mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1));
4639                         mipedgetc = _mm_mul_ps(mipedgetc, mipedgescale);
4640                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4641                         {
4642                                 int texunit = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].texunits[j];
                                     // an out of range unit number ends the list
4643                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4644                                         break;
4645                                 texture = dpsoftrast.texbound[texunit];
                                     // only filter modes above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a mip level
4646                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4647                                 {
4648                                         mipdensity = _mm_mul_ps(mipedgetc, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4649                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4650                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4651                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4652                                         // this will be multiplied in the texturing routine by the texture resolution
4653                                         y = _mm_cvtss_si32(mipdensity);
4654                                         if (y > 0)
4655                                         {
                                                     // half of log2 of the squared density, limited to the last mip level
4656                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4657                                                 if (y > texture->mipmaps - 1)
4658                                                         y = texture->mipmaps - 1;
4659                                                 triangle->mip[texunit] = y;
4660                                         }
4661                                 }
4662                         }
4663                 }
4664
                 // advance the ring buffer write position
4665             dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
4666                 dpsoftrast.trianglepool.usedtriangles++;
4667
4668                 numqueued++;
                     // hand a full batch over to the rasterizer threads
4669                 if (numqueued >= DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES)
4670                 {
4671                         MEMORY_BARRIER;
4672                         dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4673
4674 #ifdef USE_THREADS
4675                         SDL_LockMutex(dpsoftrast.trianglemutex);
4676                         SDL_CondBroadcast(dpsoftrast.trianglecond);
4677                         SDL_UnlockMutex(dpsoftrast.trianglemutex);
4678 #else
4679                         DPSOFTRAST_Draw_FlushThreads();
4680 #endif
4681                         numqueued = 0;
4682                 }
4683         }
             // hand over the remaining partial batch
4684         if (numqueued > 0)
4685         {
4686                 MEMORY_BARRIER;
4687                 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4688
4689 #ifdef USE_THREADS
4690                 SDL_LockMutex(dpsoftrast.trianglemutex);
4691                 SDL_CondBroadcast(dpsoftrast.trianglecond);
4692                 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4693 #else
4694                 DPSOFTRAST_Draw_FlushThreads();
4695 #endif
4696         }
4697 #endif
4698 }
4699
4700 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4701 {
4702         int i;
4703         int lastarray = DPSOFTRAST_ARRAY_POSITION;
4704         unsigned char arraymask[DPSOFTRAST_ARRAY_TOTAL];
4705         memset(arraymask, false, sizeof(arraymask));
4706         arraymask[DPSOFTRAST_ARRAY_POSITION] = true;
4707         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4708         {
4709                 int arrayindex = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4710                 if (arrayindex >= DPSOFTRAST_ARRAY_TOTAL)
4711                         break;
4712                 switch (arrayindex)
4713                 {
4714                         case DPSOFTRAST_ARRAY_POSITION:
4715                         case DPSOFTRAST_ARRAY_COLOR: 
4716                                 break;
4717                         default:
4718                                 if (dpsoftrast.pointer_texcoordf[arrayindex-DPSOFTRAST_ARRAY_TEXCOORD0] == NULL)
4719                                         continue;
4720                                 break;
4721                 }
4722                 arraymask[arrayindex] = true;
4723                 if (arrayindex > lastarray)
4724                         lastarray = arrayindex;
4725         }
4726         DPSOFTRAST_Draw_LoadVertices(firstvertex, numvertices, arraymask[DPSOFTRAST_ARRAY_COLOR]);
4727         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4728 //      DPSOFTRAST_Draw_ProjectVertices(dpsoftrast.screencoord4f, dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], numvertices);
4729         DPSOFTRAST_Draw_ProcessTriangles(firstvertex, numtriangles, element3i, element3s, arraymask, lastarray+1);
4730 }
4731
// Public flush entry point: calls DPSOFTRAST_Draw_SyncCommands and then
// DPSOFTRAST_Draw_FlushThreads, in that order.
// NOTE(review): presumably this returns only after all queued work has been
// rasterized - confirm against DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
        // first sync the queued commands ...
        DPSOFTRAST_Draw_SyncCommands();

        // ... then flush the rasterizer threads
        DPSOFTRAST_Draw_FlushThreads();
}
4737
4738 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4739 {
4740         int i;
4741         union
4742         {
4743                 int i;
4744                 unsigned char b[4];
4745         }
4746         u;
4747         u.i = 1;
4748         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4749         dpsoftrast.bigendian = u.b[3];
4750         dpsoftrast.fb_width = width;
4751         dpsoftrast.fb_height = height;
4752         dpsoftrast.fb_depthpixels = depthpixels;
4753         dpsoftrast.fb_colorpixels[0] = colorpixels;
4754         dpsoftrast.fb_colorpixels[1] = NULL;
4755         dpsoftrast.fb_colorpixels[1] = NULL;
4756         dpsoftrast.fb_colorpixels[1] = NULL;
4757         dpsoftrast.texture_firstfree = 1;
4758         dpsoftrast.texture_end = 1;
4759         dpsoftrast.texture_max = 0;
4760         dpsoftrast.viewport[0] = 0;
4761         dpsoftrast.viewport[1] = 0;
4762         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4763         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4764         dpsoftrast.color[0] = 1;
4765         dpsoftrast.color[1] = 1;
4766         dpsoftrast.color[2] = 1;
4767         dpsoftrast.color[3] = 1;
4768         dpsoftrast.cullface = GL_BACK;
4769 #ifdef USE_THREADS
4770         dpsoftrast.numthreads = bound(1, numthreads, 64);
4771         dpsoftrast.trianglemutex = SDL_CreateMutex();
4772         dpsoftrast.trianglecond = SDL_CreateCond();
4773 #else
4774         dpsoftrast.numthreads = 1;
4775 #endif
4776         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4777         for (i = 0; i < dpsoftrast.numthreads; i++)
4778         {
4779                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4780                 thread->index = i;
4781                 thread->colormask[1] = 1;
4782                 thread->colormask[2] = 1;
4783                 thread->colormask[3] = 1;
4784                 thread->blendfunc[0] = GL_ONE;
4785                 thread->blendfunc[1] = GL_ZERO;
4786                 thread->depthmask = true;
4787                 thread->depthtest = true;
4788                 thread->depthfunc = GL_LEQUAL;
4789                 thread->scissortest = false;
4790                 thread->alphatest = false;
4791                 thread->alphafunc = GL_GREATER;
4792                 thread->alphavalue = 0.5f;
4793                 thread->scissor[0] = 0;
4794                 thread->scissor[1] = 0;
4795                 thread->scissor[2] = dpsoftrast.fb_width;
4796                 thread->scissor[3] = dpsoftrast.fb_height;
4797                 thread->depthrange[0] = 0;
4798                 thread->depthrange[1] = 1;
4799                 thread->polygonoffset[0] = 0;
4800                 thread->polygonoffset[1] = 0;
4801
4802                 thread->numspans = 0;
4803                 thread->triangleoffset = 0;
4804                 thread->commandoffset = 0;
4805                 thread->waiting = false;
4806 #ifdef USE_THREADS
4807                 thread->waitcond = SDL_CreateCond();
4808 #endif
4809
4810                 thread->validate = -1;
4811                 DPSOFTRAST_Validate(thread, -1);
4812 #ifdef USE_THREADS
4813                 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4814 #endif
4815         }
4816 }
4817
4818 void DPSOFTRAST_Shutdown(void)
4819 {
4820         int i;
4821 #ifdef USE_THREADS
4822         if(dpsoftrast.numthreads > 0)
4823         {
4824                 DPSOFTRAST_State_Thread *thread;
4825                 SDL_LockMutex(dpsoftrast.trianglemutex);
4826                 for (i = 0; i < dpsoftrast.numthreads; i++)
4827                 {
4828                         thread = &dpsoftrast.threads[i];
4829                         thread->index = -1;
4830                 }
4831                 SDL_CondBroadcast(dpsoftrast.trianglecond);
4832                 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4833                 for (i = 0; i < dpsoftrast.numthreads; i++)
4834                 {
4835                         thread = &dpsoftrast.threads[i];
4836                         SDL_WaitThread(thread->thread, NULL);
4837                         SDL_DestroyCond(thread->waitcond);
4838                 }
4839                 SDL_DestroyMutex(dpsoftrast.trianglemutex);
4840                 SDL_DestroyCond(dpsoftrast.trianglecond);
4841         }
4842 #endif
4843         for (i = 0;i < dpsoftrast.texture_end;i++)
4844                 if (dpsoftrast.texture[i].bytes)
4845                         MM_FREE(dpsoftrast.texture[i].bytes);
4846         if (dpsoftrast.texture)
4847                 free(dpsoftrast.texture);
4848         if (dpsoftrast.threads)
4849                 MM_FREE(dpsoftrast.threads);
4850         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4851 }
4852