]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
threading and more SSE2 optimizations for dpsoftrast
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "dpsoftrast.h"
7
8 //#define USETHREADS
9 #ifdef USETHREADS
10 #include <SDL.h>
11 #include <SDL_thread.h>
12 #endif
13
// When not compiling as C++, make `bool` an alias for qboolean
// (qboolean is not declared in this file; presumably it comes from quakedef.h).
14 #ifndef __cplusplus
15 typedef qboolean bool;
16 #endif
17
// Byte alignments that the ALIGN() and ATOMIC() wrappers below request.
18 #define ALIGN_SIZE 16
19 #define ATOMIC_SIZE 32
20
// ALIGN(var) / ATOMIC(var) wrap a declaration with the compiler-specific
// spelling of a 16-byte / 32-byte alignment attribute.  On compilers other
// than GCC-compatible ones and MSVC they expand to the bare declaration, so
// no alignment is requested there.
// MEMORY_BARRIER expands to _mm_sfence() on GCC and MSVC, otherwise to a no-op.
21 #if defined(__GNUC__)
22 #define ALIGN(var) var __attribute__((__aligned__(16)))
23 #define ATOMIC(var) var __attribute__((__aligned__(32)))
24 #define MEMORY_BARRIER (_mm_sfence())
25 //(__sync_synchronize())
26 #elif defined(_MSC_VER)
27 #define ALIGN(var) __declspec(align(16)) var
28 #define ATOMIC(var) __declspec(align(32)) var
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(MemoryBarrier())
31 #else
32 #define ALIGN(var) var
33 #define ATOMIC(var) var
34 #define MEMORY_BARRIER ((void)0)
35 #endif
36
// Without USETHREADS the barrier is always compiled out.
37 #ifndef USETHREADS
38 #undef MEMORY_BARRIER
39 #define MEMORY_BARRIER ((void)0)
40 #endif
41
42 #ifdef SSE2_PRESENT
43 #include <emmintrin.h>
44
45 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
46
47 static void *MM_CALLOC(size_t nmemb, size_t size)
48 {
49         void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
50         if(ptr != NULL) memset(ptr, 0, nmemb*size);
51         return ptr;
52 }
53
54 #define MM_FREE _mm_free
55 #else
56 #define MM_MALLOC(size) malloc(size)
57 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
58 #define MM_FREE free
59 #endif
60
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets.
61 typedef enum DPSOFTRAST_ARRAY_e
62 {
63         DPSOFTRAST_ARRAY_POSITION,
64         DPSOFTRAST_ARRAY_COLOR,
65         DPSOFTRAST_ARRAY_TEXCOORD0,
66         DPSOFTRAST_ARRAY_TEXCOORD1,
67         DPSOFTRAST_ARRAY_TEXCOORD2,
68         DPSOFTRAST_ARRAY_TEXCOORD3,
69         DPSOFTRAST_ARRAY_TEXCOORD4,
70         DPSOFTRAST_ARRAY_TEXCOORD5,
71         DPSOFTRAST_ARRAY_TEXCOORD6,
72         DPSOFTRAST_ARRAY_TEXCOORD7,
73         DPSOFTRAST_ARRAY_TOTAL // number of array slots; used to size attribute tables
74 }
75 DPSOFTRAST_ARRAY;
76
// One slot of the texture table.  A slot whose `bytes` pointer is NULL is
// treated as free by the texture management functions in this file.
77 typedef struct DPSOFTRAST_Texture_s
78 {
79         int flags; // DPSOFTRAST_TEXTURE_* flags passed to DPSOFTRAST_Texture_New
80         int width; // base level width
81         int height; // base level height
82         int depth; // base level depth
83         int sides; // 6 when the cubemap flag is set, otherwise 1
84         DPSOFTRAST_TEXTURE_FILTER filter;
85         int mipmaps; // number of valid entries in mipmap[]
86         int size; // total size in bytes of all levels
87         unsigned char *bytes; // pixel storage for all levels (4 bytes per pixel)
        // per level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
88         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
89 }
90 DPSOFTRAST_Texture;
91
// Commands use the same alignment as ALIGN().
92 #define COMMAND_SIZE ALIGN_SIZE
93 #define COMMAND_ALIGN(var) ALIGN(var)
94
// Common header of every command: the opcode is always the first member.
95 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
96 {
97         int opcode;
98 }
99 DPSOFTRAST_Command);
100
101 enum { DPSOFTRAST_OPCODE_Reset = 0 };
102
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and an aligned struct type
// DPSOFTRAST_Command_<name> that starts with `int opcode` followed by `fields`.
103 #define DEFCOMMAND(opcodeval, name, fields) \
104         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
105         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
106         { \
107                 int opcode; \
108                 fields \
109         } DPSOFTRAST_Command_##name );
110
// Size in bytes of the command pool's storage.
111 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
112
// Byte buffer holding queued commands plus its bookkeeping.
// NOTE(review): freecommand/usedcommands look like a ring-buffer write
// position and fill level in bytes - confirm against the code that uses them.
113 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
114 {
115         int freecommand;
116         int usedcommands;
117         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
118 }
119 DPSOFTRAST_State_Command_Pool);
120
// One entry of the triangle pool.  DPSOFTRAST_Draw_SyncCommands also writes
// entries with starty = endy = -1 that only carry a commandoffset.
121 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
122 {
123         int commandoffset; // value of commandpool.freecommand when the entry was written
124         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
125         int starty;
126         int endy;
127         int numpoints;
128         float w[3];
129         ALIGN(float coords[4][4]);
130         ALIGN(int ycoords[4]);
        // per attribute array: [0] per-x slope, [1] per-y slope, [2] base value
        // (this is how the DPSOFTRAST_CALCATTRIB macros read it)
131         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
132 }
133 DPSOFTRAST_State_Triangle);
134
// Evaluate an interpolated vertex attribute at the start of a span:
//   slope = attribs[arrayindex][0]                         (change per x step)
//   data  = attribs[arrayindex][2] + span.x*slope + span.y*attribs[arrayindex][1]
// DPSOFTRAST_CALCATTRIB works on __m128 values, DPSOFTRAST_CALCATTRIB4F on
// float[4] arrays.  `triangle` and `span` are pointers; both are parenthesized
// in the expansion so any pointer expression may be passed.  Note that the
// arguments are evaluated more than once.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	slope[0] = (triangle)->attribs[arrayindex][0][0]; \
	slope[1] = (triangle)->attribs[arrayindex][0][1]; \
	slope[2] = (triangle)->attribs[arrayindex][0][2]; \
	slope[3] = (triangle)->attribs[arrayindex][0][3]; \
	data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
151                                         
152 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
153
// A horizontal run of pixels on one framebuffer row, produced from a triangle.
154 typedef ALIGN(struct DPSOFTRAST_State_Span_s
155 {
156         int triangle; // triangle this span was generated by
157         int x; // framebuffer x coord
158         int y; // framebuffer y coord
159         int length; // pixel count
160         int startx; // usable range (according to pixelmask)
161         int endx; // usable range (according to pixelmask)
162         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
163 }
164 DPSOFTRAST_State_Span);
165
// Capacity of the per-thread span array.
166 #define DPSOFTRAST_DRAW_MAXSPANS 1024
167
// Number of entries in the triangle ring buffer.
168 #define DPSOFTRAST_DRAW_MAXTRIANGLEPOOL 4096
169 #define DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES 64
170
// Ring buffer of triangles.  freetriangle is the next slot to write (it wraps
// to 0 after the last slot, see DPSOFTRAST_Draw_SyncCommands); usedtriangles
// counts the slots currently considered in use.
171 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_Pool_s
172 {
173         int freetriangle;
174         int usedtriangles;
175         ATOMIC(DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLEPOOL]);
176 }
177 DPSOFTRAST_State_Triangle_Pool);
178
// Bit flags kept in DPSOFTRAST_State_Thread.validate.  A set bit means the
// matching derived state is stale; DPSOFTRAST_Validate recomputes it and
// clears the bit.  DPSOFTRAST_VALIDATE_DRAW is the union of all three.
179 #define DPSOFTRAST_VALIDATE_FB 1
180 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
181 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
182 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
183
// Blend modes selected by DPSOFTRAST_RecalcBlendFunc from the blend factor
// pair; OPAQUE is also the fallback for unrecognized pairs.
184 typedef enum DPSOFTRAST_BLENDMODE_e
185 {
186         DPSOFTRAST_BLENDMODE_OPAQUE,
187         DPSOFTRAST_BLENDMODE_ALPHA,
188         DPSOFTRAST_BLENDMODE_ADDALPHA,
189         DPSOFTRAST_BLENDMODE_ADD,
190         DPSOFTRAST_BLENDMODE_INVMOD,
191         DPSOFTRAST_BLENDMODE_MUL,
192         DPSOFTRAST_BLENDMODE_MUL2,
193         DPSOFTRAST_BLENDMODE_SUBALPHA,
194         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
195         DPSOFTRAST_BLENDMODE_TOTAL // number of blend modes
196 }
197 DPSOFTRAST_BLENDMODE;
198
// Per-thread rasterizer state: a private copy of the render state, the values
// derived from it, the thread's progress through the shared pools and its
// span list.
199 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
200 {
201 #ifdef USETHREADS
202         SDL_Thread *thread;
203 #endif
204         int index;
205         
206         int colormask[4];
207         int blendfunc[2]; // [0] source factor, [1] destination factor (see DPSOFTRAST_RecalcBlendFunc)
208         int blendsubtract;
209         int depthmask;
210         int depthtest;
211         int depthfunc;
212         int scissortest;
213         int alphatest;
214         int alphafunc;
215         float alphavalue;
216         int scissor[4]; // x, y, width, height; y is measured from the bottom (flipped in DPSOFTRAST_RecalcFB)
217         int viewport[4];
218         float depthrange[2];
219         float polygonoffset[2];
220
221         int shader_mode;
222         int shader_permutation;
223
224         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
225         
226         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
227         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
228
229         // DPSOFTRAST_VALIDATE_ flags
230         int validate;
231
232         // derived values (DPSOFTRAST_VALIDATE_FB)
233         int fb_colormask;
234         int fb_clearscissor[4]; // x, y, width, height after clamping to the framebuffer
235
236         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
237         int fb_depthfunc; // depthfunc, or GL_ALWAYS while depth testing is off
238
239         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
240         int fb_blendmode; // a DPSOFTRAST_BLENDMODE value
241
242         ATOMIC(int commandoffset);
243         int triangleoffset; // compared against trianglepool.freetriangle to measure how far behind this thread is
244
        // set by DPSOFTRAST_Draw_FreeTrianglePool while it blocks on waitcond for this thread
245         bool waiting;
246 #ifdef USETHREADS
247         SDL_cond *waitcond;
248 #endif
249
250         int numspans;
251         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
252 }
253 DPSOFTRAST_State_Thread);
254
// Global rasterizer state: render targets, vertex array pointers, the texture
// table, the worker threads and the shared command/triangle pools.
255 typedef ATOMIC(struct DPSOFTRAST_State_s
256 {
        // render targets, set by DPSOFTRAST_SetRenderTargets
257         int fb_width;
258         int fb_height;
259         unsigned int *fb_depthpixels;
260         unsigned int *fb_colorpixels[4];
261
262         int viewport[4];
263         ALIGN(float fb_viewportcenter[4]);
264         ALIGN(float fb_viewportscale[4]);
265
266         float color[4];
267         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
268         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
269
270         int cullface;
271
272         const float *pointer_vertex3f;
273         const float *pointer_color4f;
274         const unsigned char *pointer_color4ub;
275         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
276         int stride_vertex;
277         int stride_color;
278         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
279         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
280         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
281
282         int numvertices;
283         int maxvertices;
284         float *in_array4f[DPSOFTRAST_ARRAY_TOTAL];
285         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
286         float *screencoord4f;
287
288         int shader_mode;
289         int shader_permutation;
290
        // texture table: texture_max slots are allocated, slots below texture_end
        // may be in use, and the search for a free slot starts at texture_firstfree
291         int texture_max;
292         int texture_end;
293         int texture_firstfree;
294         DPSOFTRAST_Texture *texture;
295
296         int bigendian;
297
298         // error reporting
299         const char *errorstring;
300
301         int numthreads;
302         DPSOFTRAST_State_Thread *threads;
303 #ifdef USETHREADS
304         SDL_mutex *trianglemutex;
305         SDL_cond *trianglecond;
306 #endif
307
        // written by DPSOFTRAST_Draw_SyncCommands after its MEMORY_BARRIER
308         ATOMIC(int drawtriangle);
309
310         DPSOFTRAST_State_Command_Pool commandpool;
311         DPSOFTRAST_State_Triangle_Pool trianglepool;
312 }
313 DPSOFTRAST_State);
314
315 DPSOFTRAST_State dpsoftrast;
316
317 extern int dpsoftrast_test;
318
// Scale factor used by DPSOFTRAST_DEPTH32_FROM_DEPTH32F (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one integer laid out as 0xAARRGGBB, scaling
// each channel by 255 and rounding to nearest (inputs are presumably in 0..1).
// Every argument is parenthesized so compound expressions such as `c + d`
// expand correctly.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer value DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
324
325 void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
326 {
327         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
328         // and viewport projection values
329         int x1, x2;
330         int y1, y2;
331         x1 = thread->scissor[0];
332         x2 = thread->scissor[0] + thread->scissor[2];
333         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
334         y2 = dpsoftrast.fb_height - thread->scissor[1];
335         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
336         if (x1 < 0) x1 = 0;
337         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
338         if (y1 < 0) y1 = 0;
339         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
340         thread->fb_clearscissor[0] = x1;
341         thread->fb_clearscissor[1] = y1;
342         thread->fb_clearscissor[2] = x2 - x1;
343         thread->fb_clearscissor[3] = y2 - y1;
344 }
345
346 void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
347 {
348         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
349 }
350
351 void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
352 {
353         if (thread->blendsubtract)
354         {
355                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
356                 {
357                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
358                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
359                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
360                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
361                 }
362         }
363         else
364         {       
365             switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
366             {
367                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
368                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
369                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
370                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
371                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
372                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
373                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
374                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
375                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
376                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
377                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
378             }
379         }
380 }
381
// Calls DPSOFTRAST_Validate(thread, f) only when at least one flag in f is
// still set in thread->validate; the expression always evaluates to 0.
// `thread` is parenthesized so any pointer expression may be passed; note
// that both arguments are evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
383
384 void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
385 {
386         mask &= thread->validate;
387         if (!mask)
388                 return;
389         if (mask & DPSOFTRAST_VALIDATE_FB)
390         {
391                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
392                 DPSOFTRAST_RecalcFB(thread);
393         }
394         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
395         {
396                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
397                 DPSOFTRAST_RecalcDepthFunc(thread);
398         }
399         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
400         {
401                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
402                 DPSOFTRAST_RecalcBlendFunc(thread);
403         }
404 }
405
406 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
407 {
408         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
409                 return &dpsoftrast.texture[index];
410         return NULL;
411 }
412
413 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
414 {
415         int w;
416         int h;
417         int d;
418         int size;
419         int s;
420         int texnum;
421         int mipmaps;
422         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
423         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
424         DPSOFTRAST_Texture *texture;
425         if (width*height*depth < 1)
426         {
427                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
428                 return 0;
429         }
430         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
431         {
432                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
433                 return 0;
434         }
435         switch(texformat)
436         {
437         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
438         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
439         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
440                 break;
441         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
442                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
443                 {
444                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
445                         return 0;
446                 }
447                 if (depth != 1)
448                 {
449                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
450                         return 0;
451                 }
452                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
453                 {
454                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
455                         return 0;
456                 }
457                 break;
458         }
459         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
460         {
461                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
462                 return 0;
463         }
464         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
465         {
466                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
467                 return 0;
468         }
469         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
470         {
471                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
472                 return 0;
473         }
474         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
475         {
476                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
477                 return 0;
478         }
479         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
480         {
481                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
482                 return 0;
483         }
484         DPSOFTRAST_Flush();
485         // find first empty slot in texture array
486         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
487                 if (!dpsoftrast.texture[texnum].bytes)
488                         break;
489         dpsoftrast.texture_firstfree = texnum + 1;
490         if (dpsoftrast.texture_max <= texnum)
491         {
492                 // expand texture array as needed
493                 if (dpsoftrast.texture_max < 1024)
494                         dpsoftrast.texture_max = 1024;
495                 else
496                         dpsoftrast.texture_max *= 2;
497                 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
498         }
499         if (dpsoftrast.texture_end <= texnum)
500                 dpsoftrast.texture_end = texnum + 1;
501         texture = &dpsoftrast.texture[texnum];
502         memset(texture, 0, sizeof(*texture));
503         texture->flags = flags;
504         texture->width = width;
505         texture->height = height;
506         texture->depth = depth;
507         texture->sides = sides;
508         w = width;
509         h = height;
510         d = depth;
511         size = 0;
512         mipmaps = 0;
513         w = width;
514         h = height;
515         d = depth;
516         for (;;)
517         {
518                 s = w * h * d * sides * 4;
519                 texture->mipmap[mipmaps][0] = size;
520                 texture->mipmap[mipmaps][1] = s;
521                 texture->mipmap[mipmaps][2] = w;
522                 texture->mipmap[mipmaps][3] = h;
523                 texture->mipmap[mipmaps][4] = d;
524                 size += s;
525                 mipmaps++;
526                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
527                         break;
528                 if (w > 1) w >>= 1;
529                 if (h > 1) h >>= 1;
530                 if (d > 1) d >>= 1;
531         }
532         texture->mipmaps = mipmaps;
533         texture->size = size;
534
535         // allocate the pixels now
536         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
537
538         return texnum;
539 }
540 void DPSOFTRAST_Texture_Free(int index)
541 {
542         DPSOFTRAST_Texture *texture;
543         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
544         DPSOFTRAST_Flush();
545         if (texture->bytes)
546                 MM_FREE(texture->bytes);
547         texture->bytes = NULL;
548         memset(texture, 0, sizeof(*texture));
549         // adjust the free range and used range
550         if (dpsoftrast.texture_firstfree > index)
551                 dpsoftrast.texture_firstfree = index;
552         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
553                 dpsoftrast.texture_end--;
554 }
// Regenerates mip levels 1..mipmaps-1 of a texture by box-filtering each level
// from the one above it.  Does nothing for invalid handles or textures with a
// single level.  Pixels are 4 bytes; every channel is averaged with rounding.
555 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
556 {
557         int i, x, y, z, w, layer0, layer1, row0, row1;
558         unsigned char *o, *i0, *i1, *i2, *i3;
559         DPSOFTRAST_Texture *texture;
560         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
561         if (texture->mipmaps <= 1)
562                 return;
563         for (i = 1;i < texture->mipmaps;i++)
564         {
565                 for (z = 0;z < texture->mipmap[i][4];z++)
566                 {
                        // the two parent layers feeding this layer; clamped when the parent has depth 1
567                         layer0 = z*2;
568                         layer1 = z*2+1;
569                         if (layer1 >= texture->mipmap[i-1][4])
570                                 layer1 = texture->mipmap[i-1][4]-1;
571                         for (y = 0;y < texture->mipmap[i][3];y++)
572                         {
                                // the two parent rows feeding this row; clamped when the parent has height 1
573                                 row0 = y*2;
574                                 row1 = y*2+1;
575                                 if (row1 >= texture->mipmap[i-1][3])
576                                         row1 = texture->mipmap[i-1][3]-1;
                                // o: start of the output row; i0..i3: starts of the parent rows
                                // (layer0,row0), (layer0,row1), (layer1,row0), (layer1,row1)
577                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
578                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
579                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
580                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
581                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
582                                 w = texture->mipmap[i][2];
583                                 if (layer1 > layer0)
584                                 {
585                                         if (texture->mipmap[i-1][2] > 1)
586                                         {
587                                                 // average 3D texture
                                                // 2x2x2 parent pixels per output pixel
588                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
589                                                 {
590                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
591                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
592                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
593                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
594                                                 }
595                                         }
596                                         else
597                                         {
598                                                 // average 3D mipmap with parent width == 1
                                                // NOTE(review): only i0 and i1 advance here; harmless as long as w is 1
                                                // whenever the parent width is 1 (which is how DPSOFTRAST_Texture_New builds the table)
599                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
600                                                 {
601                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
602                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
603                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
604                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
605                                                 }
606                                         }
607                                 }
608                                 else
609                                 {
610                                         if (texture->mipmap[i-1][2] > 1)
611                                         {
612                                                 // average 2D texture (common case)
                                                // 2x2 parent pixels per output pixel
613                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
614                                                 {
615                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
616                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
617                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
618                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
619                                                 }
620                                         }
621                                         else
622                                         {
623                                                 // 2D texture with parent width == 1
                                                // a single output pixel averaged from the two parent rows
624                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
625                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
626                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
627                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
628                                         }
629                                 }
630                         }
631                 }
632         }
633 }
634 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
635 {
636         DPSOFTRAST_Texture *texture;
637         unsigned char *dst;
638         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
639         DPSOFTRAST_Flush();
640         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
641         while (blockheight > 0)
642         {
643                 memcpy(dst, pixels, blockwidth * 4);
644                 pixels += blockwidth * 4;
645                 dst += texture->mipmap[0][2] * 4;
646                 blockheight--;
647         }
648         DPSOFTRAST_Texture_CalculateMipmaps(index);
649 }
650 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
651 {
652         DPSOFTRAST_Texture *texture;
653         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654         DPSOFTRAST_Flush();
655         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
656         DPSOFTRAST_Texture_CalculateMipmaps(index);
657 }
658 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
659 {
660         DPSOFTRAST_Texture *texture;
661         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
662         return texture->mipmap[mip][2];
663 }
664 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
665 {
666         DPSOFTRAST_Texture *texture;
667         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
668         return texture->mipmap[mip][3];
669 }
670 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
671 {
672         DPSOFTRAST_Texture *texture;
673         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
674         return texture->mipmap[mip][4];
675 }
676 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
677 {
678         DPSOFTRAST_Texture *texture;
679         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
680         DPSOFTRAST_Flush();
681         return texture->bytes + texture->mipmap[mip][0];
682 }
683 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
684 {
685         DPSOFTRAST_Texture *texture;
686         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
687         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
688         {
689                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
690                 return;
691         }
692         DPSOFTRAST_Flush();
693         texture->filter = filter;
694 }
695
696 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
697 {
698         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
699                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
700                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
701                 DPSOFTRAST_Flush();
702         dpsoftrast.fb_width = width;
703         dpsoftrast.fb_height = height;
704         dpsoftrast.fb_depthpixels = depthpixels;
705         dpsoftrast.fb_colorpixels[0] = colorpixels0;
706         dpsoftrast.fb_colorpixels[1] = colorpixels1;
707         dpsoftrast.fb_colorpixels[2] = colorpixels2;
708         dpsoftrast.fb_colorpixels[3] = colorpixels3;
709 }
710
711 void DPSOFTRAST_Draw_FlushThreads(void);
712
// Makes sure at least `space` entries of the triangle pool are unused.
// Returns immediately when the cached usedtriangles count already leaves
// enough room.  Otherwise it recomputes the real usage from the threads'
// positions and, in USETHREADS builds, waits for the thread that is furthest
// behind until enough entries are free.
// NOTE(review): without USETHREADS nothing inside the loop changes the values
// it tests, so the loop can only end through the first `break` check - confirm
// this function is never reached with a full pool in non-threaded builds.
713 void DPSOFTRAST_Draw_FreeTrianglePool(int space)
714 {
715         DPSOFTRAST_State_Thread *thread;
716         int i;
717         int freetriangle = dpsoftrast.trianglepool.freetriangle;
718         int usedtriangles = dpsoftrast.trianglepool.usedtriangles;
719         if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space)
720             return;
721 #ifdef USETHREADS
722         SDL_LockMutex(dpsoftrast.trianglemutex);
723 #endif
724         for(;;)
725         {
726             int waitindex = -1;
727             int triangleoffset;
728             usedtriangles = 0;
            // find the thread with the largest ring distance between its
            // triangleoffset and the write position; that distance is the real usage
729             for (i = 0; i < dpsoftrast.numthreads; i++)
730             {
731                 thread = &dpsoftrast.threads[i];
732                 triangleoffset = freetriangle - thread->triangleoffset;
733                 if (triangleoffset < 0)
734                     triangleoffset += DPSOFTRAST_DRAW_MAXTRIANGLEPOOL;
735                 if (triangleoffset > usedtriangles)
736                 {
737                     waitindex = i;
738                     usedtriangles = triangleoffset;
739                 }
740             }
            // done when there is enough room, or when no thread has anything outstanding
741             if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space || waitindex < 0)
742                 break;
743 #ifdef USETHREADS
            // flag the slowest thread, wake the workers, then sleep on that
            // thread's condition (SDL_CondWait releases trianglemutex while waiting)
744             thread = &dpsoftrast.threads[waitindex];
745             thread->waiting = true;
746             SDL_CondBroadcast(dpsoftrast.trianglecond);
747             SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
748             thread->waiting = false;
749 #endif
750         }
751 #ifdef USETHREADS
752         SDL_UnlockMutex(dpsoftrast.trianglemutex);
753 #endif
        // store the recomputed usage
754         dpsoftrast.trianglepool.usedtriangles = usedtriangles;
755 }
756
757 void DPSOFTRAST_Draw_SyncCommands(void)
758 {
759         DPSOFTRAST_State_Triangle *triangle;
760         if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
761 #ifdef USETHREADS
762             DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
763 #else
764             DPSOFTRAST_Draw_FlushThreads();
765 #endif
766         triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
767         triangle->commandoffset = dpsoftrast.commandpool.freecommand;
768         triangle->starty = -1;
769         triangle->endy = -1;
770         dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
771         dpsoftrast.trianglepool.usedtriangles++;
772         MEMORY_BARRIER;
773         dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
774 }
775
776 void DPSOFTRAST_Draw_FreeCommandPool(int space)
777 {
778         DPSOFTRAST_State_Thread *thread;
779         int i;
780         int freecommand = dpsoftrast.commandpool.freecommand;
781         int usedcommands = dpsoftrast.commandpool.usedcommands;
782         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
783                 return;
784         DPSOFTRAST_Draw_SyncCommands();
785 #ifdef USETHREADS
786         SDL_LockMutex(dpsoftrast.trianglemutex);
787 #endif
788         for(;;)
789         {
790                 int waitindex = -1;
791                 int commandoffset;
792                 usedcommands = 0;
793                 for (i = 0; i < dpsoftrast.numthreads; i++)
794                 {
795                         thread = &dpsoftrast.threads[i]; 
796                         commandoffset = freecommand - thread->commandoffset;
797                         if (commandoffset < 0)
798                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
799                         if (commandoffset > usedcommands)
800                         {
801                                 waitindex = i;
802                                 usedcommands = commandoffset;
803                         }
804                 }
805                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
806                         break;
807 #ifdef USETHREADS
808                 thread = &dpsoftrast.threads[waitindex];
809                 thread->waiting = true;
810                 SDL_CondBroadcast(dpsoftrast.trianglecond);
811                 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
812                 thread->waiting = false;
813 #endif
814         }
815 #ifdef USETHREADS
816         SDL_UnlockMutex(dpsoftrast.trianglemutex);
817 #endif
818         dpsoftrast.commandpool.usedcommands = usedcommands;
819 }
820
// Allocates pool space for a DPSOFTRAST_Command_<name> and returns it as a typed pointer.
// The struct size is rounded up to the next multiple of COMMAND_SIZE; the mask arithmetic
// assumes COMMAND_SIZE is a power of two (TODO confirm at its definition).
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand((sizeof( DPSOFTRAST_Command_##name ) + (COMMAND_SIZE-1)) & ~(size_t)(COMMAND_SIZE-1)))
823
824 static void *DPSOFTRAST_AllocateCommand(int size)
825 {
826         DPSOFTRAST_Command *command;
827         int freecommand = dpsoftrast.commandpool.freecommand;
828         int usedcommands = dpsoftrast.commandpool.usedcommands;
829         int extra = sizeof(DPSOFTRAST_Command);
830         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
832         if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
833         {
834 #ifdef USETHREADS
835                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
836 #else
837                 DPSOFTRAST_Draw_FlushThreads();
838 #endif
839                 freecommand = dpsoftrast.commandpool.freecommand;
840                 usedcommands = dpsoftrast.commandpool.usedcommands;
841         }
842         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
843         {
844                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
845                 command->opcode = DPSOFTRAST_OPCODE_Reset;
846                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
847                 freecommand = 0;
848         }
849         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
850         freecommand += size;
851         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
852                 freecommand = 0;
853
854         dpsoftrast.commandpool.freecommand = freecommand;
855         dpsoftrast.commandpool.usedcommands = usedcommands + size;
856         return command;
857 }
858         
859 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
860 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
861 {
862         thread->viewport[0] = command->x;
863         thread->viewport[1] = command->y;
864         thread->viewport[2] = command->width;
865         thread->viewport[3] = command->height;
866         thread->validate |= DPSOFTRAST_VALIDATE_FB;
867 }
868 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
869 {
870         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
871         command->opcode = DPSOFTRAST_OPCODE_Viewport;
872         command->x = x;
873         command->y = y;
874         command->width = width;
875         command->height = height;
876
877         dpsoftrast.viewport[0] = x;
878         dpsoftrast.viewport[1] = y;
879         dpsoftrast.viewport[2] = width;
880         dpsoftrast.viewport[3] = height;
881         dpsoftrast.fb_viewportcenter[1] = dpsoftrast.viewport[0] + 0.5f * dpsoftrast.viewport[2] - 0.5f;
882         dpsoftrast.fb_viewportcenter[2] = dpsoftrast.fb_height - dpsoftrast.viewport[1] - 0.5f * dpsoftrast.viewport[3] - 0.5f;
883         dpsoftrast.fb_viewportcenter[3] = 0.5f;
884         dpsoftrast.fb_viewportcenter[0] = 0.0f;
885         dpsoftrast.fb_viewportscale[1] = 0.5f * dpsoftrast.viewport[2];
886         dpsoftrast.fb_viewportscale[2] = -0.5f * dpsoftrast.viewport[3];
887         dpsoftrast.fb_viewportscale[3] = 0.5f;
888         dpsoftrast.fb_viewportscale[0] = 1.0f;
889 }
890
891 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
892 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
893 {
894         int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
895         unsigned int *p;
896         unsigned int c;
897         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
898         x1 = thread->fb_clearscissor[0];
899         y1 = thread->fb_clearscissor[1];
900         x2 = thread->fb_clearscissor[2];
901         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
902         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
903         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
904         if(y1 < t1) y1 = t1;
905         if(y2 > t2) y2 = t2;
906         w = x2 - x1;
907         h = y2 - y1;
908         if (w < 1 || h < 1)
909                 return;
910         // FIXME: honor fb_colormask?
911         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
912         for (i = 0;i < 4;i++)
913         {
914                 if (!dpsoftrast.fb_colorpixels[i])
915                         continue;
916                 for (y = y1;y < y2;y++)
917                 {
918                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
919                         for (x = x1;x < x2;x++)
920                                 p[x] = c;
921                 }
922         }
923 }
924 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
925 {
926         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
927         command->opcode = DPSOFTRAST_OPCODE_ClearColor;
928         command->r = r;
929         command->g = g;
930         command->b = b;
931         command->a = a;
932 }
933
934 DEFCOMMAND(3, ClearDepth, float depth;)
935 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
936 {
937         int x1, y1, x2, y2, w, h, x, y, t1, t2;
938         unsigned int *p;
939         unsigned int c;
940         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
941         x1 = thread->fb_clearscissor[0];
942         y1 = thread->fb_clearscissor[1];
943         x2 = thread->fb_clearscissor[2];
944         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
945         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
946         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
947         if(y1 < t1) y1 = t1;
948         if(y2 > t2) y2 = t2;
949         w = x2 - x1;
950         h = y2 - y1;
951         if (w < 1 || h < 1)
952                 return;
953         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
954         for (y = y1;y < y2;y++)
955         {
956                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957                 for (x = x1;x < x2;x++)
958                         p[x] = c;
959         }
960 }
961 void DPSOFTRAST_ClearDepth(float d)
962 {
963         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
964         command->opcode = DPSOFTRAST_OPCODE_ClearDepth;
965         command->depth = d;
966 }
967
968 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
969 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
970 {
971         thread->colormask[0] = command->r != 0;
972         thread->colormask[1] = command->g != 0;
973         thread->colormask[2] = command->b != 0;
974         thread->colormask[3] = command->a != 0;
975         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
976 }
977 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
978 {
979         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
980         command->opcode = DPSOFTRAST_OPCODE_ColorMask;
981         command->r = r;
982         command->g = g;
983         command->b = b;
984         command->a = a;
985 }
986
987 DEFCOMMAND(5, DepthTest, int enable;)
988 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
989 {
990         thread->depthtest = command->enable;
991         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
992 }
993 void DPSOFTRAST_DepthTest(int enable)
994 {
995         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
996         command->opcode = DPSOFTRAST_OPCODE_DepthTest;
997         command->enable = enable;
998 }
999
1000 DEFCOMMAND(6, ScissorTest, int enable;)
1001 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1002 {
1003         thread->scissortest = command->enable;
1004         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1005 }
1006 void DPSOFTRAST_ScissorTest(int enable)
1007 {
1008         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1009         command->opcode = DPSOFTRAST_OPCODE_ScissorTest;
1010         command->enable = enable;
1011 }
1012
1013 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1014 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1015 {
1016         thread->scissor[0] = command->x;
1017         thread->scissor[1] = command->y;
1018         thread->scissor[2] = command->width;
1019         thread->scissor[3] = command->height;
1020         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1021 }
1022 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1023 {
1024         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1025         command->opcode = DPSOFTRAST_OPCODE_Scissor;
1026         command->x = x;
1027         command->y = y;
1028         command->width = width;
1029         command->height = height;
1030 }
1031
1032 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1033 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1034 {
1035         thread->blendfunc[0] = command->sfactor;
1036         thread->blendfunc[1] = command->dfactor;
1037         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1038 }
1039 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1040 {
1041         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1042         command->opcode = DPSOFTRAST_OPCODE_BlendFunc;
1043         command->sfactor = sfactor;
1044         command->dfactor = dfactor;
1045 }
1046
1047 DEFCOMMAND(9, BlendSubtract, int enable;)
1048 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1049 {
1050         thread->blendsubtract = command->enable;
1051         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1052 }
1053 void DPSOFTRAST_BlendSubtract(int enable)
1054 {
1055         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1056         command->opcode = DPSOFTRAST_OPCODE_BlendSubtract;
1057         command->enable = enable;
1058 }
1059
1060 DEFCOMMAND(10, DepthMask, int enable;)
1061 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1062 {
1063         thread->depthmask = command->enable;
1064 }
1065 void DPSOFTRAST_DepthMask(int enable)
1066 {
1067         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1068         command->opcode = DPSOFTRAST_OPCODE_DepthMask;
1069         command->enable = enable;
1070 }
1071
1072 DEFCOMMAND(11, DepthFunc, int func;)
1073 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1074 {
1075         thread->depthfunc = command->func;
1076 }
1077 void DPSOFTRAST_DepthFunc(int func)
1078 {
1079         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1080         command->opcode = DPSOFTRAST_OPCODE_DepthFunc;
1081         command->func = func;
1082 }
1083
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1086 {
1087         thread->depthrange[0] = command->nearval;
1088         thread->depthrange[1] = command->farval;
1089 }
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1091 {
1092         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093         command->opcode = DPSOFTRAST_OPCODE_DepthRange;
1094         command->nearval = nearval;
1095         command->farval = farval;
1096 }
1097
1098 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1099 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1100 {
1101         thread->polygonoffset[0] = command->alongnormal;
1102         thread->polygonoffset[1] = command->intoview;
1103 }
1104 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1105 {
1106         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1107         command->opcode = DPSOFTRAST_OPCODE_PolygonOffset;
1108         command->alongnormal = alongnormal;
1109         command->intoview = intoview;
1110 }
1111
1112 void DPSOFTRAST_CullFace(int mode)
1113 {
1114         dpsoftrast.cullface = mode;
1115 }
1116
1117 DEFCOMMAND(15, AlphaTest, int enable;)
1118 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1119 {
1120         thread->alphatest = command->enable;
1121 }
1122 void DPSOFTRAST_AlphaTest(int enable)
1123 {
1124         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1125         command->opcode = DPSOFTRAST_OPCODE_AlphaTest;
1126         command->enable = enable;
1127 }
1128
1129 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1130 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1131 {
1132         thread->alphafunc = command->func;
1133         thread->alphavalue = command->ref;
1134 }
1135 void DPSOFTRAST_AlphaFunc(int func, float ref)
1136 {
1137         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1138         command->opcode = DPSOFTRAST_OPCODE_AlphaFunc;
1139         command->func = func;
1140         command->ref = ref;
1141 }
1142
1143 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1144 {
1145         dpsoftrast.color[0] = r;
1146         dpsoftrast.color[1] = g;
1147         dpsoftrast.color[2] = b;
1148         dpsoftrast.color[3] = a;
1149 }
1150
1151 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1152 {
1153         int outstride = blockwidth * 4;
1154         int instride = dpsoftrast.fb_width * 4;
1155         int bx1 = blockx;
1156         int by1 = blocky;
1157         int bx2 = blockx + blockwidth;
1158         int by2 = blocky + blockheight;
1159         int bw;
1160         int bh;
1161         int x;
1162         int y;
1163         unsigned char *inpixels;
1164         unsigned char *b;
1165         unsigned char *o;
1166         DPSOFTRAST_Flush();
1167         if (bx1 < 0) bx1 = 0;
1168         if (by1 < 0) by1 = 0;
1169         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1170         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1171         bw = bx2 - bx1;
1172         bh = by2 - by1;
1173         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1174         if (dpsoftrast.bigendian)
1175         {
1176                 for (y = by1;y < by2;y++)
1177                 {
1178                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1180                         for (x = bx1;x < bx2;x++)
1181                         {
1182                                 o[0] = b[3];
1183                                 o[1] = b[2];
1184                                 o[2] = b[1];
1185                                 o[3] = b[0];
1186                                 o += 4;
1187                                 b += 4;
1188                         }
1189                 }
1190         }
1191         else
1192         {
1193                 for (y = by1;y < by2;y++)
1194                 {
1195                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1197                         memcpy(o, b, bw*4);
1198                 }
1199         }
1200
1201 }
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1203 {
1204         int tx1 = tx;
1205         int ty1 = ty;
1206         int tx2 = tx + width;
1207         int ty2 = ty + height;
1208         int sx1 = sx;
1209         int sy1 = sy;
1210         int sx2 = sx + width;
1211         int sy2 = sy + height;
1212         int swidth;
1213         int sheight;
1214         int twidth;
1215         int theight;
1216         int sw;
1217         int sh;
1218         int tw;
1219         int th;
1220         int y;
1221         unsigned int *spixels;
1222         unsigned int *tpixels;
1223         DPSOFTRAST_Texture *texture;
1224         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225         if (mip < 0 || mip >= texture->mipmaps) return;
1226         DPSOFTRAST_Flush();
1227         spixels = dpsoftrast.fb_colorpixels[0];
1228         swidth = dpsoftrast.fb_width;
1229         sheight = dpsoftrast.fb_height;
1230         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231         twidth = texture->mipmap[mip][2];
1232         theight = texture->mipmap[mip][3];
1233         if (tx1 < 0) tx1 = 0;
1234         if (ty1 < 0) ty1 = 0;
1235         if (tx2 > twidth) tx2 = twidth;
1236         if (ty2 > theight) ty2 = theight;
1237         if (sx1 < 0) sx1 = 0;
1238         if (sy1 < 0) sy1 = 0;
1239         if (sx2 > swidth) sx2 = swidth;
1240         if (sy2 > sheight) sy2 = sheight;
1241         tw = tx2 - tx1;
1242         th = ty2 - ty1;
1243         sw = sx2 - sx1;
1244         sh = sy2 - sy1;
1245         if (tw > sw) tw = sw;
1246         if (th > sh) th = sh;
1247         if (tw < 1 || th < 1)
1248                 return;
1249         for (y = 0;y < th;y++)
1250                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1251         if (texture->mipmaps > 1)
1252                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1253 }
1254
1255 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1256 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1257 {
1258         thread->texbound[command->unitnum] = command->texture;
1259 }
1260 void DPSOFTRAST_SetTexture(int unitnum, int index)
1261 {
1262         DPSOFTRAST_Command_SetTexture *command;
1263         DPSOFTRAST_Texture *texture;
1264         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1265         {
1266                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1267                 return;
1268         }
1269         texture = DPSOFTRAST_Texture_GetByIndex(index);
1270         if (index && !texture)
1271         {
1272                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1273                 return;
1274         }
1275
1276         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1277         command->opcode = DPSOFTRAST_OPCODE_SetTexture;
1278         command->unitnum = unitnum;
1279         command->texture = texture;
1280
1281         dpsoftrast.texbound[unitnum] = texture;
1282 }
1283
1284 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1285 {
1286         dpsoftrast.pointer_vertex3f = vertex3f;
1287         dpsoftrast.stride_vertex = stride;
1288 }
1289 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1290 {
1291         dpsoftrast.pointer_color4f = color4f;
1292         dpsoftrast.pointer_color4ub = NULL;
1293         dpsoftrast.stride_color = stride;
1294 }
1295 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1296 {
1297         dpsoftrast.pointer_color4f = NULL;
1298         dpsoftrast.pointer_color4ub = color4ub;
1299         dpsoftrast.stride_color = stride;
1300 }
1301 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1302 {
1303         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1304         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1305         dpsoftrast.stride_texcoord[unitnum] = stride;
1306 }
1307
1308 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1309 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1310 {
1311         thread->shader_mode = command->mode;
1312         thread->shader_permutation = command->permutation;
1313 }
1314 void DPSOFTRAST_SetShader(int mode, int permutation)
1315 {
1316         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1317         command->opcode = DPSOFTRAST_OPCODE_SetShader;
1318         command->mode = mode;
1319         command->permutation = permutation;
1320
1321         dpsoftrast.shader_mode = mode;
1322         dpsoftrast.shader_permutation = permutation;
1323 }
1324
1325 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1326 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1327 {
1328         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1329 }
1330 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1331 {
1332         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1333         command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1334         command->index = index;
1335         command->val[0] = v0;
1336         command->val[1] = v1;
1337         command->val[2] = v2;
1338         command->val[3] = v3;
1339
1340         dpsoftrast.uniform4f[index*4+0] = v0;
1341         dpsoftrast.uniform4f[index*4+1] = v1;
1342         dpsoftrast.uniform4f[index*4+2] = v2;
1343         dpsoftrast.uniform4f[index*4+3] = v3;
1344 }
1345 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1346 {
1347         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1348         command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1349         command->index = index;
1350         memcpy(command->val, v, sizeof(command->val));
1351
1352         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1353 }
1354
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1357 {
1358         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1359 }
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1361 {
1362 #ifdef SSE2_PRESENT
1363         int i, index;
1364         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1365         {
1366                 __m128 m0, m1, m2, m3;
1367                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368                 command->opcode = DPSOFTRAST_OPCODE_UniformMatrix4f;
1369                 command->index = index;
1370                 if (((size_t)v)&(ALIGN_SIZE-1))
1371                 {
1372                         m0 = _mm_loadu_ps(v);
1373                         m1 = _mm_loadu_ps(v+4);
1374                         m2 = _mm_loadu_ps(v+8);
1375                         m3 = _mm_loadu_ps(v+12);
1376                 }
1377                 else
1378                 {
1379                         m0 = _mm_load_ps(v);
1380                         m1 = _mm_load_ps(v+4);
1381                         m2 = _mm_load_ps(v+8);
1382                         m3 = _mm_load_ps(v+12);
1383                 }
1384                 if (transpose)
1385                 {
1386                         __m128 t0, t1, t2, t3;
1387                         t0 = _mm_unpacklo_ps(m0, m1);
1388                         t1 = _mm_unpacklo_ps(m2, m3);
1389                         t2 = _mm_unpackhi_ps(m0, m1);
1390                         t3 = _mm_unpackhi_ps(m2, m3);
1391                         m0 = _mm_movelh_ps(t0, t1);
1392                         m1 = _mm_movehl_ps(t1, t0);
1393                         m2 = _mm_movelh_ps(t2, t3);
1394                         m3 = _mm_movehl_ps(t3, t2);                     
1395                 }
1396                 _mm_store_ps(command->val, m0);
1397                 _mm_store_ps(command->val+4, m1);
1398                 _mm_store_ps(command->val+8, m2);
1399                 _mm_store_ps(command->val+12, m3);
1400                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1401                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1402                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1403                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1404         }
1405 #endif
1406 }
1407
1408 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1409 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1410 {
1411         thread->uniform1i[command->index] = command->val;
1412 }
1413 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1414 {
1415         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1416         command->opcode = DPSOFTRAST_OPCODE_Uniform1i;
1417         command->index = index;
1418         command->val = i0;
1419
1420         dpsoftrast.uniform1i[command->index] = i0;
1421 }
1422
1423 #ifdef SSE2_PRESENT
1424 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1425 {
1426         float *end = dst + size*4;
1427         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1428         {
1429                 while (dst < end)
1430                 {
1431                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1432                         dst += 4;
1433                         src += stride;
1434                 }
1435         }
1436         else
1437         {
1438                 while (dst < end)
1439                 {
1440                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1441                         dst += 4;
1442                         src += stride;
1443                 }
1444         }
1445 }
1446
1447 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1448 {
1449         float *end = dst + size*4;
1450         if (stride == sizeof(float[3]))
1451         {
1452                 float *end4 = dst + (size&~3)*4;        
1453                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1454                 {
1455                         while (dst < end4)
1456                         {
1457                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1458                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1459                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1460                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1461                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1462                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1463                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1464                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1465                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1466                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1467                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1469                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1470                                 dst += 16;
1471                                 src += 4*sizeof(float[3]);
1472                         }
1473                 }
1474                 else
1475                 {
1476                         while (dst < end4)
1477                         {
1478                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1479                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1480                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1481                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1482                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1483                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1484                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1486                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1487                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1488                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1489                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1490                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1491                                 dst += 16;
1492                                 src += 4*sizeof(float[3]);
1493                         }
1494                 }
1495         }
1496         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1497         {
1498                 while (dst < end)
1499                 {
1500                         __m128 v = _mm_loadu_ps((const float *)src);
1501                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1502                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1503                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1504                         _mm_store_ps(dst, v);
1505                         dst += 4;
1506                         src += stride;
1507                 }
1508         }
1509         else
1510         {
1511                 while (dst < end)
1512                 {
1513                         __m128 v = _mm_load_ps((const float *)src);
1514                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1515                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1516                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1517                         _mm_store_ps(dst, v);
1518                         dst += 4;
1519                         src += stride;
1520                 }
1521         }
1522 }
1523
1524 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1525 {
1526         float *end = dst + size*4;
1527         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1528         if (stride == sizeof(float[2]))
1529         {
1530                 float *end2 = dst + (size&~1)*4;
1531                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1532                 {
1533                         while (dst < end2)
1534                         {
1535                                 __m128 v = _mm_loadu_ps((const float *)src);
1536                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1537                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1538                                 dst += 8;
1539                                 src += 2*sizeof(float[2]);
1540                         }
1541                 }
1542                 else
1543                 {
1544                         while (dst < end2)
1545                         {
1546                                 __m128 v = _mm_load_ps((const float *)src);
1547                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1548                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1549                                 dst += 8;
1550                                 src += 2*sizeof(float[2]);
1551                         }
1552                 }
1553         }
1554         while (dst < end)
1555         {
1556                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1557                 dst += 4;
1558                 src += stride;
1559         }
1560 }
1561
1562 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1563 {
1564         float *end = dst + size*4;
1565         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1566         if (stride == sizeof(unsigned char[4]))
1567         {
1568                 float *end4 = dst + (size&~3)*4;
1569                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1570                 {
1571                         while (dst < end4)
1572                         {
1573                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1574                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1575                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1576                     _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1577                     _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1578                                 dst += 16;
1579                                 src += 4*sizeof(unsigned char[4]);
1580                         }
1581                 }
1582                 else
1583                 {
1584                 while (dst < end4)
1585                 {
1586                     __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1587                     _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1588                     _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1589                     _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1590                     _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1591                     dst += 16;
1592                     src += 4*sizeof(unsigned char[4]);
1593                 }
1594                 }
1595         }
1596         while (dst < end)
1597         {
1598                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1599                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1600                 dst += 4;
1601                 src += stride;
1602         }
1603 }
1604
// Replicates the four floats at `src` into `size` consecutive 4-float slots
// starting at `dst`. `src` may have any alignment; `dst` is written with
// aligned stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float *stop;
	for (stop = dst + 4*size;dst < stop;dst += 4)
		_mm_store_ps(dst, value);
}
1615 #endif
1616
1617 void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors)
1618 {
1619 #ifdef SSE2_PRESENT
1620         int i;
1621         int j;
1622         int stride;
1623         const float *v;
1624         float *p;
1625         float *data;
1626         const unsigned char *b;
1627         dpsoftrast.numvertices = numvertices;
1628         if (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1629         {
1630                 if (dpsoftrast.maxvertices < 4096)
1631                         dpsoftrast.maxvertices = 4096;
1632                 while (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1633                         dpsoftrast.maxvertices *= 2;
1634                 if (dpsoftrast.in_array4f[0])
1635                         MM_FREE(dpsoftrast.in_array4f[0]);
1636                 data = (float *)MM_CALLOC(1, dpsoftrast.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
1637                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1638                         dpsoftrast.in_array4f[i] = data;
1639                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1640                         dpsoftrast.post_array4f[i] = data;
1641                 dpsoftrast.screencoord4f = data;
1642                 data += dpsoftrast.maxvertices * 4;
1643         }
1644         stride = dpsoftrast.stride_vertex;
1645         v = (const float *)((unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride);
1646         p = dpsoftrast.in_array4f[0];
1647         DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1648         if (needcolors)
1649         {
1650                 if (dpsoftrast.pointer_color4f)
1651                 {
1652                         stride = dpsoftrast.stride_color;
1653                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride);
1654                         p = dpsoftrast.in_array4f[1];
1655                         DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1656                 }
1657                 else if (dpsoftrast.pointer_color4ub)
1658                 {
1659                         stride = dpsoftrast.stride_color;
1660                         b = (const unsigned char *)((const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride);
1661                         p = dpsoftrast.in_array4f[1];
1662                         DPSOFTRAST_Load4bTo4f(p, b, numvertices, stride);
1663                 }
1664                 else
1665                 {
1666                         p = dpsoftrast.in_array4f[1];
1667                         DPSOFTRAST_Fill4f(p, dpsoftrast.color, numvertices);
1668                 }
1669         }
1670         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL-2;j++)
1671         {
1672                 if (dpsoftrast.pointer_texcoordf[j])
1673                 {
1674                         stride = dpsoftrast.stride_texcoord[j];
1675                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_texcoordf[j] + firstvertex * stride);
1676                         p = dpsoftrast.in_array4f[j+2];
1677                         switch(dpsoftrast.components_texcoord[j])
1678                         {
1679                         case 2:
1680                                 DPSOFTRAST_Load2fTo4f(p, (const unsigned char *)v, numvertices, stride);
1681                                 break;
1682                         case 3:
1683                                 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1684                                 break;
1685                         case 4:
1686                                 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1687                                 break;
1688                         }
1689                 }
1690         }
1691 #endif
1692 }
1693
// Transforms `numitems` 4-float vectors from `in4f` by the 16-float matrix
// `inmatrix16f` and writes the results to `out4f` (aligned stores).
// Each output is in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15].
// A matrix that is bytewise equal to the identity turns the call into a plain
// memcpy. `in4f` and `inmatrix16f` may have any alignment.
// Does nothing unless SSE2_PRESENT is defined.
void DPSOFTRAST_Array_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 col0, col1, col2, col3;
	float *stop = out4f + numitems*4;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity: nothing to compute, just copy
		memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	col0 = _mm_loadu_ps(inmatrix16f);
	col1 = _mm_loadu_ps(inmatrix16f + 4);
	col2 = _mm_loadu_ps(inmatrix16f + 8);
	col3 = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
	{
		// aligned input
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
	else
	{
		// unaligned input
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
#endif
}
1740
// Copies `numitems` 4-float vectors from `in4f` to `out4f`.
void DPSOFTRAST_Array_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1745
1746 #ifdef SSE2_PRESENT
1747 static __m128 DPSOFTRAST_Draw_ProjectVertex(__m128 v)
1748 {
1749         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1750         __m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1751         v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1752         v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1753         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1754         return v;
1755 }
1756 #endif
1757
// Copies `numitems` 4-float vertices from `in4f` to `out4f` unchanged and
// writes their projected form to `screen4f`.
//   out4f    - receives a copy of the input (aligned stores)
//   screen4f - receives, per vertex (x, y, z, w):
//              (center[1] + scale[1]*x/w, center[2] + scale[2]*y/w,
//               center[3] + scale[3]*z/w, center[0] + scale[0]/w)
//              with center/scale taken from dpsoftrast.fb_viewportcenter and
//              dpsoftrast.fb_viewportscale (aligned stores)
//   in4f     - input vertices; any alignment is accepted, matching
//              DPSOFTRAST_Array_TransformProject which forwards its input
//              here when its matrix is the identity
// Does nothing unless SSE2_PRESENT is defined.
void DPSOFTRAST_Array_Project(float *out4f, float *screen4f, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			_mm_store_ps(out4f, v);
			// rotate to (w, x, y, z) and put 1 in lane 0 before scaling
			v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
			// rotate back so x, y, z occupy lanes 0..2
			_mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			_mm_store_ps(out4f, v);
			v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
			_mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
#endif
}
1776
// Transforms `numitems` 4-float vertices from `in4f` by `inmatrix16f`, stores
// the transformed vertices in `out4f` and their projected form in `screen4f`.
// The transform is in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15];
// the projection of a transformed vertex (x, y, z, w) is
//   (center[1] + scale[1]*x/w, center[2] + scale[2]*y/w,
//    center[3] + scale[3]*z/w, center[0] + scale[0]/w)
// using dpsoftrast.fb_viewportcenter / dpsoftrast.fb_viewportscale.
// A matrix bytewise equal to the identity is handed to DPSOFTRAST_Array_Project.
// Both outputs are written with aligned stores; `in4f` may be unaligned.
// Does nothing unless SSE2_PRESENT is defined.
void DPSOFTRAST_Array_TransformProject(float *out4f, float *screen4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 col0, col1, col2, col3, center, scale;
	float *stop = out4f + numitems*4;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// no transform needed, only the projection
		DPSOFTRAST_Array_Project(out4f, screen4f, in4f, numitems);
		return;
	}
	center = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	scale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	col0 = _mm_loadu_ps(inmatrix16f);
	col1 = _mm_loadu_ps(inmatrix16f + 4);
	col2 = _mm_loadu_ps(inmatrix16f + 8);
	col3 = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
	{
		// aligned input
		for (;out4f < stop;in4f += 4, out4f += 4, screen4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			__m128 wwww;
			v = _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw)));
			_mm_store_ps(out4f, v);
			wwww = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			// rotate to (w, x, y, z) and put 1 in lane 0 before scaling
			v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
			v = _mm_move_ss(v, _mm_set_ss(1.0f));
			v = _mm_add_ps(center, _mm_div_ps(_mm_mul_ps(scale, v), wwww));
			_mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
		}
	}
	else
	{
		// unaligned input
		for (;out4f < stop;in4f += 4, out4f += 4, screen4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			__m128 wwww;
			v = _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw)));
			_mm_store_ps(out4f, v);
			wwww = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
			v = _mm_move_ss(v, _mm_set_ss(1.0f));
			v = _mm_add_ps(center, _mm_div_ps(_mm_mul_ps(scale, v), wwww));
			_mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
		}
	}
#endif
}
1834
1835 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1836 {
1837         int x;
1838         int startx = span->startx;
1839         int endx = span->endx;
1840         float wslope = triangle->w[0];
1841         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1842         float endz = 1.0f / (w + wslope * startx);
1843         for (x = startx;x < endx;)
1844         {
1845                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1846                 float z = endz, dz;
1847                 if(nextsub >= endx) nextsub = endsub = endx-1;
1848                 endz = 1.0f / (w + wslope * nextsub);
1849                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1850                 for (; x <= endsub; x++, z += dz)
1851                         zf[x] = z;
1852         }
1853 }
1854
1855 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1856 {
1857         int x;
1858         int startx = span->startx;
1859         int endx = span->endx;
1860         int d[4];
1861         float a, b;
1862         unsigned char * RESTRICT pixelmask = span->pixelmask;
1863         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1864         if (!pixel)
1865                 return;
1866         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1867         // handle alphatest now (this affects depth writes too)
1868         if (thread->alphatest)
1869                 for (x = startx;x < endx;x++)
1870                         if (in4f[x*4+3] < 0.5f)
1871                                 pixelmask[x] = false;
1872         // FIXME: this does not handle bigendian
1873         switch(thread->fb_blendmode)
1874         {
1875         case DPSOFTRAST_BLENDMODE_OPAQUE:
1876                 for (x = startx;x < endx;x++)
1877                 {
1878                         if (!pixelmask[x])
1879                                 continue;
1880                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1881                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1882                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1883                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1884                         pixel[x*4+0] = d[0];
1885                         pixel[x*4+1] = d[1];
1886                         pixel[x*4+2] = d[2];
1887                         pixel[x*4+3] = d[3];
1888                 }
1889                 break;
1890         case DPSOFTRAST_BLENDMODE_ALPHA:
1891                 for (x = startx;x < endx;x++)
1892                 {
1893                         if (!pixelmask[x])
1894                                 continue;
1895                         a = in4f[x*4+3] * 255.0f;
1896                         b = 1.0f - in4f[x*4+3];
1897                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1898                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1899                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1900                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1901                         pixel[x*4+0] = d[0];
1902                         pixel[x*4+1] = d[1];
1903                         pixel[x*4+2] = d[2];
1904                         pixel[x*4+3] = d[3];
1905                 }
1906                 break;
1907         case DPSOFTRAST_BLENDMODE_ADDALPHA:
1908                 for (x = startx;x < endx;x++)
1909                 {
1910                         if (!pixelmask[x])
1911                                 continue;
1912                         a = in4f[x*4+3] * 255.0f;
1913                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1914                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1915                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1916                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1917                         pixel[x*4+0] = d[0];
1918                         pixel[x*4+1] = d[1];
1919                         pixel[x*4+2] = d[2];
1920                         pixel[x*4+3] = d[3];
1921                 }
1922                 break;
1923         case DPSOFTRAST_BLENDMODE_ADD:
1924                 for (x = startx;x < endx;x++)
1925                 {
1926                         if (!pixelmask[x])
1927                                 continue;
1928                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1929                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1930                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1931                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1932                         pixel[x*4+0] = d[0];
1933                         pixel[x*4+1] = d[1];
1934                         pixel[x*4+2] = d[2];
1935                         pixel[x*4+3] = d[3];
1936                 }
1937                 break;
1938         case DPSOFTRAST_BLENDMODE_INVMOD:
1939                 for (x = startx;x < endx;x++)
1940                 {
1941                         if (!pixelmask[x])
1942                                 continue;
1943                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1944                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1945                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1946                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1947                         pixel[x*4+0] = d[0];
1948                         pixel[x*4+1] = d[1];
1949                         pixel[x*4+2] = d[2];
1950                         pixel[x*4+3] = d[3];
1951                 }
1952                 break;
1953         case DPSOFTRAST_BLENDMODE_MUL:
1954                 for (x = startx;x < endx;x++)
1955                 {
1956                         if (!pixelmask[x])
1957                                 continue;
1958                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1959                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1960                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1961                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1962                         pixel[x*4+0] = d[0];
1963                         pixel[x*4+1] = d[1];
1964                         pixel[x*4+2] = d[2];
1965                         pixel[x*4+3] = d[3];
1966                 }
1967                 break;
1968         case DPSOFTRAST_BLENDMODE_MUL2:
1969                 for (x = startx;x < endx;x++)
1970                 {
1971                         if (!pixelmask[x])
1972                                 continue;
1973                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
1974                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
1975                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
1976                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
1977                         pixel[x*4+0] = d[0];
1978                         pixel[x*4+1] = d[1];
1979                         pixel[x*4+2] = d[2];
1980                         pixel[x*4+3] = d[3];
1981                 }
1982                 break;
1983         case DPSOFTRAST_BLENDMODE_SUBALPHA:
1984                 for (x = startx;x < endx;x++)
1985                 {
1986                         if (!pixelmask[x])
1987                                 continue;
1988                         a = in4f[x*4+3] * -255.0f;
1989                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
1990                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
1991                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
1992                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
1993                         pixel[x*4+0] = d[0];
1994                         pixel[x*4+1] = d[1];
1995                         pixel[x*4+2] = d[2];
1996                         pixel[x*4+3] = d[3];
1997                 }
1998                 break;
1999         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2000                 for (x = startx;x < endx;x++)
2001                 {
2002                         if (!pixelmask[x])
2003                                 continue;
2004                         a = 255.0f;
2005                         b = 1.0f - in4f[x*4+3];
2006                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2007                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2008                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2009                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2010                         pixel[x*4+0] = d[0];
2011                         pixel[x*4+1] = d[1];
2012                         pixel[x*4+2] = d[2];
2013                         pixel[x*4+3] = d[3];
2014                 }
2015                 break;
2016         }
2017 }
2018
2019 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2020 {
2021 #ifdef SSE2_PRESENT
2022         int x;
2023         int startx = span->startx;
2024         int endx = span->endx;
2025         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2026         unsigned char * RESTRICT pixelmask = span->pixelmask;
2027         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2028         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2029         if (!pixel)
2030                 return;
2031         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2032         pixeli += span->y * dpsoftrast.fb_width + span->x;
2033         // handle alphatest now (this affects depth writes too)
2034         if (thread->alphatest)
2035                 for (x = startx;x < endx;x++)
2036                         if (in4ub[x*4+3] < 0.5f)
2037                                 pixelmask[x] = false;
2038         // FIXME: this does not handle bigendian
2039         switch(thread->fb_blendmode)
2040         {
2041         case DPSOFTRAST_BLENDMODE_OPAQUE:
2042                 for (x = startx;x + 4 <= endx;)
2043                 {
2044                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2045                         {
2046                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2047                                 x += 4;
2048                         }
2049                         else
2050                         {
2051                                 if (pixelmask[x])
2052                                         pixeli[x] = ini[x];
2053                                 x++;
2054                         }
2055                 }
2056                 for (;x < endx;x++)
2057                         if (pixelmask[x])
2058                                 pixeli[x] = ini[x];
2059                 break;
2060         case DPSOFTRAST_BLENDMODE_ALPHA:
2061         #define FINISHBLEND(blend2, blend1) \
2062                 for (x = startx;x + 2 <= endx;x += 2) \
2063                 { \
2064                         __m128i src, dst; \
2065                         switch (*(const unsigned short*)&pixelmask[x]) \
2066                         { \
2067                         case 0x0101: \
2068                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2069                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2070                                 blend2; \
2071                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2072                                 continue; \
2073                         case 0x0100: \
2074                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2075                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2076                                 blend1; \
2077                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2078                                 continue; \
2079                         case 0x0001: \
2080                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2081                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2082                                 blend1; \
2083                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2084                                 continue; \
2085                         } \
2086                         break; \
2087                 } \
2088                 for(;x < endx; x++) \
2089                 { \
2090                         __m128i src, dst; \
2091                         if (!pixelmask[x]) \
2092                                 continue; \
2093                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2094                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2095                         blend1; \
2096                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2097                 }
2098
2099                 FINISHBLEND({
2100                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2101                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2102                 }, {
2103                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2104                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2105                 });
2106                 break;
2107         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2108                 FINISHBLEND({
2109                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2110                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2111                 }, {
2112                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2113                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2114                 });
2115                 break;
2116         case DPSOFTRAST_BLENDMODE_ADD:
2117                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2118                 break;
2119         case DPSOFTRAST_BLENDMODE_INVMOD:
2120                 FINISHBLEND({
2121                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2122                 }, {
2123                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2124                 });
2125                 break;
2126         case DPSOFTRAST_BLENDMODE_MUL:
2127                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2128                 break;
2129         case DPSOFTRAST_BLENDMODE_MUL2:
2130                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2131                 break;
2132         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2133                 FINISHBLEND({
2134                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2135                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2136                 }, {
2137                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2138                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2139                 });
2140                 break;
2141         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2142                 FINISHBLEND({
2143                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2144                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2145                 }, {
2146                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2147                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2148                 });
2149                 break;
2150         }
2151 #endif
2152 }
2153
2154 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2155 {
2156         int x;
2157         int startx = span->startx;
2158         int endx = span->endx;
2159         int flags;
2160         float c[4];
2161         float data[4];
2162         float slope[4];
2163         float tc[2], endtc[2];
2164         float tcscale[2];
2165         unsigned int tci[2];
2166         unsigned int tci1[2];
2167         unsigned int tcimin[2];
2168         unsigned int tcimax[2];
2169         int tciwrapmask[2];
2170         int tciwidth;
2171         int filter;
2172         int mip;
2173         const unsigned char * RESTRICT pixelbase;
2174         const unsigned char * RESTRICT pixel[4];
2175         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2176         // if no texture is bound, just fill it with white
2177         if (!texture)
2178         {
2179                 for (x = startx;x < endx;x++)
2180                 {
2181                         out4f[x*4+0] = 1.0f;
2182                         out4f[x*4+1] = 1.0f;
2183                         out4f[x*4+2] = 1.0f;
2184                         out4f[x*4+3] = 1.0f;
2185                 }
2186                 return;
2187         }
2188         mip = triangle->mip[texunitindex];
2189         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2190         // if this mipmap of the texture is 1 pixel, just fill it with that color
2191         if (texture->mipmap[mip][1] == 4)
2192         {
2193                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2194                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2195                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2196                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2197                 for (x = startx;x < endx;x++)
2198                 {
2199                         out4f[x*4+0] = c[0];
2200                         out4f[x*4+1] = c[1];
2201                         out4f[x*4+2] = c[2];
2202                         out4f[x*4+3] = c[3];
2203                 }
2204                 return;
2205         }
2206         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2207         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2208         flags = texture->flags;
2209         tcscale[0] = texture->mipmap[mip][2];
2210         tcscale[1] = texture->mipmap[mip][3];
2211         tciwidth = texture->mipmap[mip][2];
2212         tcimin[0] = 0;
2213         tcimin[1] = 0;
2214         tcimax[0] = texture->mipmap[mip][2]-1;
2215         tcimax[1] = texture->mipmap[mip][3]-1;
2216         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2217         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2218         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2219         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2220         for (x = startx;x < endx;)
2221         {
2222                 unsigned int subtc[2];
2223                 unsigned int substep[2];
2224                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2225                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2226                 if(nextsub >= endx)
2227                 {
2228                         nextsub = endsub = endx-1;      
2229                         if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2230                 }
2231                 tc[0] = endtc[0];
2232                 tc[1] = endtc[1];
2233                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2234                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2235                 substep[0] = (endtc[0] - tc[0]) * subscale;
2236                 substep[1] = (endtc[1] - tc[1]) * subscale;
2237                 subtc[0] = tc[0] * (1<<16);
2238                 subtc[1] = tc[1] * (1<<16);
2239                 if(filter)
2240                 {
2241                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2242                         {
2243                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2244                                 {
2245                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2246                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2247                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2248                                         tci[0] = subtc[0]>>16;
2249                                         tci[1] = subtc[1]>>16;
2250                                         tci1[0] = tci[0] + 1;
2251                                         tci1[1] = tci[1] + 1;
2252                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2253                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2254                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2255                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2256                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2257                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2258                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2259                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2260                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2261                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2262                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2263                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2264                                         out4f[x*4+0] = c[0];
2265                                         out4f[x*4+1] = c[1];
2266                                         out4f[x*4+2] = c[2];
2267                                         out4f[x*4+3] = c[3];
2268                                 }
2269                         }
2270                         else
2271                         {
2272                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2273                                 {
2274                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2275                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2276                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2277                                         tci[0] = subtc[0]>>16;
2278                                         tci[1] = subtc[1]>>16;
2279                                         tci1[0] = tci[0] + 1;
2280                                         tci1[1] = tci[1] + 1;
2281                                         tci[0] &= tciwrapmask[0];
2282                                         tci[1] &= tciwrapmask[1];
2283                                         tci1[0] &= tciwrapmask[0];
2284                                         tci1[1] &= tciwrapmask[1];
2285                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2286                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2287                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2288                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2289                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2290                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2291                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2292                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2293                                         out4f[x*4+0] = c[0];
2294                                         out4f[x*4+1] = c[1];
2295                                         out4f[x*4+2] = c[2];
2296                                         out4f[x*4+3] = c[3];
2297                                 }
2298                         }
2299                 }
2300                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2301                 {
2302                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2303                         {
2304                                 tci[0] = subtc[0]>>16;
2305                                 tci[1] = subtc[1]>>16;
2306                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2307                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2308                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2309                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2310                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2311                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2312                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2313                                 out4f[x*4+0] = c[0];
2314                                 out4f[x*4+1] = c[1];
2315                                 out4f[x*4+2] = c[2];
2316                                 out4f[x*4+3] = c[3];
2317                         }
2318                 }
2319                 else
2320                 {
2321                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2322                         {
2323                                 tci[0] = subtc[0]>>16;
2324                                 tci[1] = subtc[1]>>16;
2325                                 tci[0] &= tciwrapmask[0];
2326                                 tci[1] &= tciwrapmask[1];
2327                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2328                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2329                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2330                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2331                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2332                                 out4f[x*4+0] = c[0];
2333                                 out4f[x*4+1] = c[1];
2334                                 out4f[x*4+2] = c[2];
2335                                 out4f[x*4+3] = c[3];
2336                         }
2337                 }
2338         }
2339 }
2340
2341 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2342 {
2343 #ifdef SSE2_PRESENT
2344         int x;
2345         int startx = span->startx;
2346         int endx = span->endx;
2347         int flags;
2348         __m128 data, slope, tcscale;
2349         __m128i tcsize, tcmask, tcoffset, tcmax;
2350         __m128 tc, endtc;
2351         __m128i subtc, substep, endsubtc;
2352         int filter;
2353         int mip;
2354         unsigned int *outi = (unsigned int *)out4ub;
2355         const unsigned char * RESTRICT pixelbase;
2356         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2357         // if no texture is bound, just fill it with white
2358         if (!texture)
2359         {
2360                 memset(out4ub + startx*4, 255, span->length*4);
2361                 return;
2362         }
2363         mip = triangle->mip[texunitindex];
2364         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2365         // if this mipmap of the texture is 1 pixel, just fill it with that color
2366         if (texture->mipmap[mip][1] == 4)
2367         {
2368                 unsigned int k = *((const unsigned int *)pixelbase);
2369                 for (x = startx;x < endx;x++)
2370                         outi[x] = k;
2371                 return;
2372         }
2373         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2374         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2375         flags = texture->flags;
2376         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2377         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2378         tcscale = _mm_cvtepi32_ps(tcsize);
2379         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2380         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2381         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2382         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2383         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2384         tcmax = filter ? _mm_packs_epi32(tcmask, tcmask) : _mm_slli_epi32(tcmask, 16);  
2385         for (x = startx;x < endx;)
2386         {
2387                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2388                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2389                 if(nextsub >= endx)
2390                 {
2391                         nextsub = endsub = endx-1;
2392                         if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2393                 }       
2394                 tc = endtc;
2395                 subtc = endsubtc;
2396                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2397                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2398                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2399                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2400                 substep = _mm_slli_epi32(substep, 1);
2401                 if (filter)
2402                 {
2403                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2404                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2405                         {
2406                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2407                                 {
2408                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2409                                         tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0x10000)), tcoffset);
2410                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2411                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2412                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), _mm_setzero_si128());
2413                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))]), _mm_setzero_si128());
2414                                         fracm = _mm_srli_epi16(subtc, 1);
2415                                         pix1 = _mm_add_epi16(pix1,
2416                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2417                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2418                                         pix3 = _mm_add_epi16(pix3,
2419                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2420                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2421                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2422                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2423                                         pix2 = _mm_add_epi16(pix2,
2424                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2425                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2426                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2427                                 }
2428                                 if (x <= endsub)
2429                                 {
2430                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2431                                         tci = _mm_madd_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 0x10000, 0, 0)), tcoffset);
2432                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(tci)]), _mm_setzero_si128());
2433                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))]), _mm_setzero_si128());
2434                                         fracm = _mm_srli_epi16(subtc, 1);
2435                                         pix1 = _mm_add_epi16(pix1,
2436                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2437                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2438                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2439                                         pix1 = _mm_add_epi16(pix1,
2440                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2441                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2442                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2443                                         x++;
2444                                 }
2445                         }
2446                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2447                         {
2448                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2449                                 {
2450                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2451                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2452                                         tci = _mm_madd_epi16(tci, tcoffset);
2453                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2454                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2455                                                                                         _mm_setzero_si128());
2456                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2457                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2458                                                                                         _mm_setzero_si128());
2459                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2460                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2461                                         tci = _mm_madd_epi16(tci, tcoffset);
2462                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2463                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2464                                                                                         _mm_setzero_si128());
2465                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2466                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2467                                                                                         _mm_setzero_si128());
2468                                         fracm = _mm_srli_epi16(subtc, 1);
2469                                         pix1 = _mm_add_epi16(pix1,
2470                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2471                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2472                                         pix3 = _mm_add_epi16(pix3,
2473                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2474                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2475                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2476                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2477                                         pix2 = _mm_add_epi16(pix2,
2478                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2479                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2480                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2481                                 }
2482                                 if (x <= endsub)
2483                                 {
2484                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2485                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2486                                         tci = _mm_madd_epi16(tci, tcoffset);
2487                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2488                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2489                                                                                         _mm_setzero_si128());
2490                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2491                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2492                                                                                         _mm_setzero_si128());
2493                                         fracm = _mm_srli_epi16(subtc, 1);
2494                                         pix1 = _mm_add_epi16(pix1,
2495                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2496                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2497                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2498                                         pix1 = _mm_add_epi16(pix1,
2499                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2500                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2501                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2502                                         x++;
2503                                 }
2504                         }
2505                         else
2506                         {
2507                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2508                                 {
2509                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2510                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2511                                         tci = _mm_madd_epi16(tci, tcoffset);
2512                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2513                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2514                                                                                         _mm_setzero_si128());
2515                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2516                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2517                                                                                         _mm_setzero_si128());
2518                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2519                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2520                                         tci = _mm_madd_epi16(tci, tcoffset);
2521                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2522                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2523                                                                                         _mm_setzero_si128());
2524                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2525                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2526                                                                                         _mm_setzero_si128());
2527                                         fracm = _mm_srli_epi16(subtc, 1);
2528                                         pix1 = _mm_add_epi16(pix1,
2529                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2530                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2531                                         pix3 = _mm_add_epi16(pix3,
2532                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2533                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2534                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2535                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2536                                         pix2 = _mm_add_epi16(pix2,
2537                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2538                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2539                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2540                                 }
2541                                 if (x <= endsub)
2542                                 {
2543                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2544                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2545                                         tci = _mm_madd_epi16(tci, tcoffset);
2546                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2547                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2548                                                                                         _mm_setzero_si128());
2549                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2550                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2551                                                                                         _mm_setzero_si128());
2552                                         fracm = _mm_srli_epi16(subtc, 1);
2553                                         pix1 = _mm_add_epi16(pix1,
2554                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2555                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2556                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2557                                         pix1 = _mm_add_epi16(pix1,
2558                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2559                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2560                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2561                                         x++;
2562                                 }
2563                         }
2564                 }
2565                 else
2566                 {
2567                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2568                         {
2569                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2570                                 {
2571                                         __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax); 
2572                                         tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2573                                         tci = _mm_madd_epi16(tci, tcoffset);
2574                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2575                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
2576                                 }
2577                                 if (x <= endsub)
2578                                 {
2579                                         __m128i tci = _mm_min_epi16(_mm_max_epi16(subtc, _mm_setzero_si128()), tcmax);
2580                                         tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
2581                                         tci = _mm_madd_epi16(tci, tcoffset);
2582                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2583                                         x++;
2584                                 }
2585                         }
2586                         else
2587                         {
2588                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2589                                 {
2590                                         __m128i tci = _mm_and_si128(subtc, tcmax); 
2591                                         tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2592                                         tci = _mm_madd_epi16(tci, tcoffset);
2593                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2594                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))];
2595                                 }
2596                                 if (x <= endsub)
2597                                 {
2598                                         __m128i tci = _mm_and_si128(subtc, tcmax); 
2599                                         tci = _mm_shufflelo_epi16(tci, _MM_SHUFFLE(3, 1, 3, 1));
2600                                         tci = _mm_madd_epi16(tci, tcoffset);
2601                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2602                                         x++;
2603                                 }
2604                         }
2605                 }
2606         }
2607 #endif
2608 }
2609
2610 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2611 {
2612         // TODO: IMPLEMENT
2613         memset(out4ub, 255, span->length*4);
2614 }
2615
// Placeholder for shadowmap sampling: the input vector is ignored and the
// result is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2621
2622 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2623 {
2624         int x;
2625         int startx = span->startx;
2626         int endx = span->endx;
2627         float c[4];
2628         float data[4];
2629         float slope[4];
2630         float z;
2631         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2632         for (x = startx;x < endx;x++)
2633         {
2634                 z = zf[x];
2635                 c[0] = (data[0] + slope[0]*x) * z;
2636                 c[1] = (data[1] + slope[1]*x) * z;
2637                 c[2] = (data[2] + slope[2]*x) * z;
2638                 c[3] = (data[3] + slope[3]*x) * z;
2639                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2640                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2641                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2642                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2643         }
2644 }
2645
2646 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2647 {
2648         int x;
2649         int startx = span->startx;
2650         int endx = span->endx;
2651         float c[4];
2652         float data[4];
2653         float slope[4];
2654         float z;
2655         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2656         for (x = startx;x < endx;x++)
2657         {
2658                 z = zf[x];
2659                 c[0] = (data[0] + slope[0]*x) * z;
2660                 c[1] = (data[1] + slope[1]*x) * z;
2661                 c[2] = (data[2] + slope[2]*x) * z;
2662                 c[3] = (data[3] + slope[3]*x) * z;
2663                 out4f[x*4+0] = c[0];
2664                 out4f[x*4+1] = c[1];
2665                 out4f[x*4+2] = c[2];
2666                 out4f[x*4+3] = c[3];
2667         }
2668 }
2669
2670 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2671 {
2672         int x, startx = span->startx, endx = span->endx;
2673         float c[4], localcolor[4];
2674         localcolor[0] = subcolor[0];
2675         localcolor[1] = subcolor[1];
2676         localcolor[2] = subcolor[2];
2677         localcolor[3] = subcolor[3];
2678         for (x = startx;x < endx;x++)
2679         {
2680                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2681                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2682                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2683                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2684                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2685                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2686                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2687                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2688         }
2689 }
2690
2691 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2692 {
2693         int x, startx = span->startx, endx = span->endx;
2694         for (x = startx;x < endx;x++)
2695         {
2696                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2697                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2698                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2699                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2700         }
2701 }
2702
2703 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2704 {
2705         int x, startx = span->startx, endx = span->endx;
2706         for (x = startx;x < endx;x++)
2707         {
2708                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2709                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2710                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2711                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2712         }
2713 }
2714
2715 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2716 {
2717         int x, startx = span->startx, endx = span->endx;
2718         float a, b;
2719         for (x = startx;x < endx;x++)
2720         {
2721                 a = 1.0f - inb4f[x*4+3];
2722                 b = inb4f[x*4+3];
2723                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2724                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2725                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2726                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2727         }
2728 }
2729
2730 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2731 {
2732         int x, startx = span->startx, endx = span->endx;
2733         float localcolor[4], ilerp, lerp;
2734         localcolor[0] = color[0];
2735         localcolor[1] = color[1];
2736         localcolor[2] = color[2];
2737         localcolor[3] = color[3];
2738         ilerp = 1.0f - localcolor[3];
2739         lerp = localcolor[3];
2740         for (x = startx;x < endx;x++)
2741         {
2742                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2743                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2744                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2745                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2746         }
2747 }
2748
2749
2750
2751 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2752 {
2753 #ifdef SSE2_PRESENT
2754         int x;
2755         int startx = span->startx;
2756         int endx = span->endx;
2757         __m128 data, slope;
2758         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2759         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2760         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2761         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2762         data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
2763         slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
2764         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2765         {
2766                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2767                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
2768                 data = _mm_add_ps(data, slope);
2769                 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2770                 mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
2771                 pix = _mm_mulhi_epu16(pix, mod);
2772                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2773         }
2774         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2775         {
2776                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2777                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2778                 mod = _mm_packs_epi32(mod, mod);
2779                 pix = _mm_mulhi_epu16(pix, mod);
2780                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2781         }
2782 #endif
2783 }
2784
2785 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2786 {
2787 #ifdef SSE2_PRESENT
2788         int x;
2789         int startx = span->startx;
2790         int endx = span->endx;
2791         __m128 data, slope;
2792         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2793         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2794         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2795         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2796         data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
2797         slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
2798         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2799         {
2800                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
2801                 data = _mm_add_ps(data, slope);
2802                 pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2803                 pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
2804                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2805         }
2806         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2807         {
2808                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2809                 pix = _mm_packs_epi32(pix, pix);
2810                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2811         }
2812 #endif
2813 }
2814
2815 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2816 {
2817 #ifdef SSE2_PRESENT
2818         int x, startx = span->startx, endx = span->endx;
2819         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2820         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2821         for (x = startx;x+2 <= endx;x+=2)
2822         {
2823                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2824                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2825                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2826                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2827         }
2828         if(x < endx)
2829         {
2830                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2831                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2832                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2833                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2834         }
2835 #endif
2836 }
2837
2838 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2839 {
2840 #ifdef SSE2_PRESENT
2841         int x, startx = span->startx, endx = span->endx;
2842         for (x = startx;x+2 <= endx;x+=2)
2843         {
2844                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2845                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2846                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2847                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2848         }
2849         if(x < endx)
2850         {
2851                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2852                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2853                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2854                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2855         }
2856 #endif
2857 }
2858
2859 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2860 {
2861 #ifdef SSE2_PRESENT
2862         int x, startx = span->startx, endx = span->endx;
2863         for (x = startx;x+2 <= endx;x+=2)
2864         {
2865                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2866                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2867                 pix1 = _mm_add_epi16(pix1, pix2);
2868                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2869         }
2870         if(x < endx)
2871         {
2872                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2873                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2874                 pix1 = _mm_add_epi16(pix1, pix2);
2875                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2876         }
2877 #endif
2878 }
2879
2880 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2881 {
2882 #ifdef SSE2_PRESENT
2883         int x, startx = span->startx, endx = span->endx;
2884         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
2885         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
2886         for (x = startx;x+2 <= endx;x+=2)
2887         {
2888                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2889                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2890                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2891                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2892         }
2893         if(x < endx)
2894         {
2895                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2896                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2897                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2898                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2899         }
2900 #endif
2901 }
2902
2903 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2904 {
2905 #ifdef SSE2_PRESENT
2906         int x, startx = span->startx, endx = span->endx;
2907         for (x = startx;x+2 <= endx;x+=2)
2908         {
2909                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2910                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2911                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2912                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2913                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2914         }
2915         if(x < endx)
2916         {
2917                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2918                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2919                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
2920                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2921                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2922         }
2923 #endif
2924 }
2925
2926 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
2927 {
2928 #ifdef SSE2_PRESENT
2929         int x, startx = span->startx, endx = span->endx;
2930         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
2931         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2932         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
2933         for (x = startx;x+2 <= endx;x+=2)
2934         {
2935                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
2936                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2937                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2938         }
2939         if(x < endx)
2940         {
2941                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
2942                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2943                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2944         }
2945 #endif
2946 }
2947
2948
2949
2950 void DPSOFTRAST_VertexShader_Generic(void)
2951 {
2952         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
2953         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
2954         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
2955         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
2956                 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
2957 }
2958
2959 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
2960 {
2961         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
2962         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2963         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2964         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
2965         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
2966         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
2967         {
2968                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
2969                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
2970                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
2971                 {
2972                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
2973                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2974                         {
2975                                 // multiply
2976                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2977                         }
2978                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
2979                         {
2980                                 // add
2981                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2982                         }
2983                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
2984                         {
2985                                 // alphablend
2986                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
2987                         }
2988                 }
2989         }
2990         else
2991                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
2992         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
2993 }
2994
2995
2996
2997 void DPSOFTRAST_VertexShader_PostProcess(void)
2998 {
2999         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3000         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
3001         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
3002 }
3003
3004 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3005 {
3006         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3007         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3008         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3009         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3010         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3011         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3012         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3013         {
3014                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3015                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3016         }
3017         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3018         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3019         {
3020                 // TODO: implement saturation
3021         }
3022         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3023         {
3024                 // TODO: implement gammaramps
3025         }
3026         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3027 }
3028
3029
3030
3031 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3032 {
3033         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3034 }
3035
3036 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3037 {
3038         // this is never called (because colormask is off when this shader is used)
3039         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3040         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3041         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3042         memset(buffer_FragColorbgra8, 0, span->length*4);
3043         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3044 }
3045
3046
3047
3048 void DPSOFTRAST_VertexShader_FlatColor(void)
3049 {
3050         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3051         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3052 }
3053
3054 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3055 {
3056         int x, startx = span->startx, endx = span->endx;
3057         int Color_Ambienti[4];
3058         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3059         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3060         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3061         Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3062         Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3063         Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3064         Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
3065         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3066         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3067         for (x = startx;x < endx;x++)
3068         {
3069                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3070                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3071                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3072                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3073         }
3074         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3075 }
3076
3077
3078
3079 void DPSOFTRAST_VertexShader_VertexColor(void)
3080 {
3081         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3082         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
3083         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3084 }
3085
3086 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3087 {
     // Vertex-color pixel shader.  For every pixel of the span whose pixelmask
     // byte is nonzero it computes, per BGRA channel,
     //   texel * (Color_Diffuse * color_varying * buffer_z[x] + Color_Ambient)
     // saturated to 8 bits, where texel comes from GL20TU_COLOR and
     // color_varying is the DPSOFTRAST_ARRAY_COLOR attribute stepped along the span.
     // The whole body is compiled only with SSE2_PRESENT; without it the
     // function does nothing.
3088 #ifdef SSE2_PRESENT
3089         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // by default results are written straight into color buffer 0 at the span's row
3090         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3091         int x, startx = span->startx, endx = span->endx;
3092         __m128i Color_Ambientm, Color_Diffusem;
3093         __m128 data, slope;
3094         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3095         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3096         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3097         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3098         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3099         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test or any non-opaque blend mode needs the finishing pass, so
             // render into the temporary span buffer instead of the framebuffer
3100         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3101                 pixel = buffer_FragColorbgra8;
             // ambient: RGB * 256 with lanes 0 and 2 swapped (RGB -> BGR, matching the
             // bgra8 buffers); the alpha lane is replaced by the Alpha uniform * 255;
             // packed to 16bit so both 64bit halves hold the same four values
3102         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3103         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3104         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3105         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse: RGB * 4096, same lane swap, alpha lane cleared to zero
3106         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3107         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3108         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
             // NOTE(review): presumably fills data/slope with the color attribute at the
             // span origin and its per-pixel step - confirm against DPSOFTRAST_CALCATTRIB
3109         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
             // same lane 0 <-> 2 swap as the uniforms, then advance to the first pixel
             // and pre-scale by 4096; Color_Diffusem (x4096) times this (x4096)
             // through mulhi's >>16 leaves a x256 fixed-point factor
3110         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3111         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3112         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3113         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3114         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // the loop increment steps both x and the interpolated color by one pixel,
             // including on the 'continue' paths below
3115         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3116         {
3117                 __m128i color, mod, pix;
                     // fast path: four consecutive pixels whose mask bytes are all 0x01
3118                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3119                 {
3120                         __m128i pix2, mod2;
3121                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3122                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // one color value per pixel, each multiplied by that pixel's
                             // buffer_z entry; data is stepped three times here and a
                             // fourth time by the loop increment
3123                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3124                         data = _mm_add_ps(data, slope);
3125                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3126                         data = _mm_add_ps(data, slope);
3127                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3128                         data = _mm_add_ps(data, slope);
3129                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                             // unpacking with zero as the low byte puts each texel byte in the
                             // high byte of its 16bit lane (texel * 256) for the final mulhi
3130                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3131                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3132                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3133                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3134                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // together with the loop's x++ this skips the four pixels just written
3135                         x += 3;
3136                         continue;
3137                 }
                     // slow path: one pixel at a time, skipping masked-out pixels
3138                 if(!pixelmask[x])
3139                         continue;
3140                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3141                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3142                 mod = _mm_packs_epi32(mod, mod);
3143                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3144                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3145         }
             // only needed when the span was rendered into the temporary buffer
3146         if(pixel == buffer_FragColorbgra8)
3147                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3148 #endif
3149 }
3150
3151
3152
3153 void DPSOFTRAST_VertexShader_Lightmap(void)
3154 {
3155         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3156         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3157         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
3158 }
3159
3160 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3161 {
     // Lightmap pixel shader.  For every pixel of the span whose pixelmask byte
     // is nonzero it computes, per BGRA channel,
     //   color * (lightmap * Color_Diffuse + Color_Ambient)
     // and, with SHADERPERMUTATION_GLOW, additionally adds glow * Color_Glow.
     // Results are saturated to 8 bits.  The whole body is compiled only with
     // SSE2_PRESENT; without it the function does nothing.
3162 #ifdef SSE2_PRESENT
3163         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // by default results are written straight into color buffer 0 at the span's row
3164         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3165         int x, startx = span->startx, endx = span->endx;
3166         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3167         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3168         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3169         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3170         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3171         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3172         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // color texture uses TEXCOORD0, the lightmap its own TEXCOORD4 set
3173         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3174         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // alpha test or any non-opaque blend mode needs the finishing pass, so
             // render into the temporary span buffer instead of the framebuffer
3175         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3176                 pixel = buffer_FragColorbgra8;
             // ambient: RGB * 256 with lanes 0 and 2 swapped (RGB -> BGR, matching the
             // bgra8 buffers); the alpha lane is replaced by the Alpha uniform * 255;
             // packed to 16bit so both 64bit halves hold the same four values
3177         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3178         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3179         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3180         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse: RGB * 256, same lane swap, alpha lane cleared to zero
3181         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3182         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3183         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3184         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3185         {
                     // the glow texture shares TEXCOORD0 with the color texture
3186                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
                     // glow: RGB * 256, same lane swap, alpha lane cleared to zero
3187                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3188                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3189                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits = ambient, high 64 bits = glow; used by the
                     // single-pixel path to do both multiplies in one mulhi
3190                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3191                 for (x = startx;x < endx;x++)
3192                 {
3193                         __m128i color, lightmap, glow, pix;
                             // fast path: four consecutive pixels whose mask bytes are all 0x01
3194                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3195                         {
3196                                 __m128i pix2;
3197                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3198                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3199                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // unpacking with zero as the low byte puts each texel byte in
                                     // the high byte of its 16bit lane (texel * 256), so every
                                     // mulhi against a x256 factor yields a x256 or x1 result;
                                     // pix covers pixels 0-1, pix2 pixels 2-3
3200                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3201                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3202                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3203                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3204                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3205                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3206                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this skips the four pixels just written
3207                                 x += 3;
3208                                 continue;
3209                         }
                             // slow path: one pixel at a time, skipping masked-out pixels
3210                         if(!pixelmask[x])
3211                                 continue;
3212                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3213                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3214                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // lightmap's upper 64 bits are zero here (only 32 bits were
                             // loaded), so the upper half of the sum is plain Color_Glowm:
                             // low half = (lightmap*diffuse + ambient) * color,
                             // high half = Color_Glow * glow
3215                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // fold the glow term (high half) onto the lit color (low half)
3216                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3217                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3218                 }
3219         }
3220         else
3221         {
                     // same two paths as above, without the glow term
3222                 for (x = startx;x < endx;x++)
3223                 {
3224                         __m128i color, lightmap, pix;
3225                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3226                         {
3227                                 __m128i pix2;
3228                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3229                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3230                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3231                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3232                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3233                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3234                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3235                                 x += 3;
3236                                 continue;
3237                         }
3238                         if(!pixelmask[x]) 
3239                                 continue;
3240                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3241                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3242                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3243                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3244                 }
3245         }
             // only needed when the span was rendered into the temporary buffer
3246         if(pixel == buffer_FragColorbgra8)
3247                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3248 #endif
3249 }
3250
3251
3252
3253 void DPSOFTRAST_VertexShader_FakeLight(void)
3254 {
3255         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3256 }
3257
3258 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3259 {
3260         // TODO: IMPLEMENT
3261         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3262         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3263         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3264         memset(buffer_FragColorbgra8, 0, span->length*4);
3265         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3266 }
3267
3268
3269
3270 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3271 {
             // no vertex work of its own: defers entirely to the lightmap vertex shader
3272         DPSOFTRAST_VertexShader_Lightmap();
3273 }
3274
3275 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3276 {
             // placeholder: until this mode gets its own implementation the span is
             // rendered with the plain lightmap pixel shader
3277         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3278         // TODO: IMPLEMENT
3279 }
3280
3281
3282
3283 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3284 {
             // no vertex work of its own: defers entirely to the lightmap vertex shader
3285         DPSOFTRAST_VertexShader_Lightmap();
3286 }
3287
3288 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3289 {
             // placeholder: until this mode gets its own implementation the span is
             // rendered with the plain lightmap pixel shader
3290         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3291         // TODO: IMPLEMENT
3292 }
3293
3294
3295
3296 void DPSOFTRAST_VertexShader_LightDirection(void)
3297 {
3298         int i;
3299         int numvertices = dpsoftrast.numvertices;
3300         float LightDir[4];
3301         float LightVector[4];
3302         float EyePosition[4];
3303         float EyeVectorModelSpace[4];
3304         float EyeVector[4];
3305         float position[4];
3306         float svector[4];
3307         float tvector[4];
3308         float normal[4];
3309         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3310         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3311         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3312         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3313         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3314         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3315         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3316         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3317         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3318         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3319         for (i = 0;i < numvertices;i++)
3320         {
3321                 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3322                 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3323                 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3324                 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3325                 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3326                 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3327                 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3328                 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3329                 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3330                 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3331                 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3332                 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3333                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3334                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3335                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3336                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3337                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3338                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3339                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3340                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3341                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3342                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3343                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3344                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3345                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3346                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3347                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3348                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3349                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3350         }
3351 }
3352
// Small math helpers for the per-pixel lighting code.
// All of these are macros and evaluate their arguments more than once, so
// never pass expressions with side effects (i++, function calls, ...).

// smaller / larger of two values
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of a and b
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length / length of the first three elements of v
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// scales the first three elements of v in place to unit length; a zero-length
// vector is left untouched.  v is parenthesized at every use so that pointer
// expressions such as (buf + 3) expand correctly, and the temporary has a
// prefixed name so it cannot capture a caller variable called len.
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
        float dpsoftrast_normalize_len = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
        if (dpsoftrast_normalize_len)\
        {\
                dpsoftrast_normalize_len = 1.0f / dpsoftrast_normalize_len;\
                (v)[0] *= dpsoftrast_normalize_len;\
                (v)[1] *= dpsoftrast_normalize_len;\
                (v)[2] *= dpsoftrast_normalize_len;\
        }\
}\
while(0)
3371
3372 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3373 {
3374         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3375         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3376         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3377         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3378         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3381         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3382         int x, startx = span->startx, endx = span->endx;
3383         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3384         float LightVectordata[4];
3385         float LightVectorslope[4];
3386         float EyeVectordata[4];
3387         float EyeVectorslope[4];
3388         float z;
3389         float diffusetex[4];
3390         float glosstex[4];
3391         float surfacenormal[4];
3392         float lightnormal[4];
3393         float eyenormal[4];
3394         float specularnormal[4];
3395         float diffuse;
3396         float specular;
3397         float SpecularPower;
3398         int d[4];
3399         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3400         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3401         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3402         Color_Glow[3] = 0.0f;
3403         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3404         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3405         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3406         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3407         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3408         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3409         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3410         Color_Pants[3] = 0.0f;
3411         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3412         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3413         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3414         Color_Shirt[3] = 0.0f;
3415         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3416         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3417         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3418         {
3419                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3420                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3421         }
3422         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3423         {
3424                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3425         }
3426         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3427         {
3428                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3429                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3430                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3431                 Color_Diffuse[3] = 0.0f;
3432                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3433                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3434                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3435                 LightColor[3] = 0.0f;
3436                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3437                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3438                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3439                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3440                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3441                 Color_Specular[3] = 0.0f;
3442                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3443                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3444                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3445                 for (x = startx;x < endx;x++)
3446                 {
3447                         z = buffer_z[x];
3448                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3449                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3450                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3451                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3452                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3453                         {
3454                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3455                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3456                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3457                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3458                         }
3459                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3460                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3461                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3462                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3463                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3464                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3465                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3466                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3467
3468                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3469                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3470                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3471                         DPSOFTRAST_Vector3Normalize(lightnormal);
3472
3473                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3474                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3475                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3476                         DPSOFTRAST_Vector3Normalize(eyenormal);
3477
3478                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3479                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3480                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3481                         DPSOFTRAST_Vector3Normalize(specularnormal);
3482
3483                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3484                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3485                         specular = pow(specular, SpecularPower * glosstex[3]);
3486                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3487                         {
3488                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3489                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3490                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3491                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3492                         }
3493                         else
3494                         {
3495                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3496                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3497                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3498                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3499                         }
3500                         buffer_FragColorbgra8[x*4+0] = d[0];
3501                         buffer_FragColorbgra8[x*4+1] = d[1];
3502                         buffer_FragColorbgra8[x*4+2] = d[2];
3503                         buffer_FragColorbgra8[x*4+3] = d[3];
3504                 }
3505         }
3506         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3507         {
3508                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3509                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3510                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3511                 Color_Diffuse[3] = 0.0f;
3512                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3513                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3514                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3515                 LightColor[3] = 0.0f;
3516                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3517                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3518                 for (x = startx;x < endx;x++)
3519                 {
3520                         z = buffer_z[x];
3521                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3522                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3523                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3524                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3525                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3526                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3527                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3528                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3529
3530                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3531                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3532                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3533                         DPSOFTRAST_Vector3Normalize(lightnormal);
3534
3535                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3536                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3537                         {
3538                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3539                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3540                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3541                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3542                         }
3543                         else
3544                         {
3545                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3546                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3547                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3548                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3549                         }
3550                         buffer_FragColorbgra8[x*4+0] = d[0];
3551                         buffer_FragColorbgra8[x*4+1] = d[1];
3552                         buffer_FragColorbgra8[x*4+2] = d[2];
3553                         buffer_FragColorbgra8[x*4+3] = d[3];
3554                 }
3555         }
3556         else
3557         {
3558                 for (x = startx;x < endx;x++)
3559                 {
3560                         z = buffer_z[x];
3561                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3562                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3563                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3564                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3565
3566                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3567                         {
3568                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3569                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3570                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3571                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3572                         }
3573                         else
3574                         {
3575                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3576                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3577                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3578                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3579                         }
3580                         buffer_FragColorbgra8[x*4+0] = d[0];
3581                         buffer_FragColorbgra8[x*4+1] = d[1];
3582                         buffer_FragColorbgra8[x*4+2] = d[2];
3583                         buffer_FragColorbgra8[x*4+3] = d[3];
3584                 }
3585         }
3586         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3587 }
3588
3589
3590
3591 void DPSOFTRAST_VertexShader_LightSource(void)
3592 {
3593         int i;
3594         int numvertices = dpsoftrast.numvertices;
3595         float LightPosition[4];
3596         float LightVector[4];
3597         float LightVectorModelSpace[4];
3598         float EyePosition[4];
3599         float EyeVectorModelSpace[4];
3600         float EyeVector[4];
3601         float position[4];
3602         float svector[4];
3603         float tvector[4];
3604         float normal[4];
3605         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3606         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3607         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3608         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3609         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3610         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3611         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3612         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3613         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3614         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3615         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3616         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
3617         for (i = 0;i < numvertices;i++)
3618         {
3619                 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3620                 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3621                 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3622                 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3623                 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3624                 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3625                 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3626                 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3627                 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3628                 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3629                 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3630                 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3631                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3632                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3633                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3634                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3635                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3636                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3637                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3638                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3639                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3640                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3641                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3642                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3643                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3644                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3645                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3646                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3647                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3648                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3649                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3650                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3651         }
3652 }
3653
3654 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3655 {
3656         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3657         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3658         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3659         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3660         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3661         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664         int x, startx = span->startx, endx = span->endx;
3665         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3666         float CubeVectordata[4];
3667         float CubeVectorslope[4];
3668         float LightVectordata[4];
3669         float LightVectorslope[4];
3670         float EyeVectordata[4];
3671         float EyeVectorslope[4];
3672         float z;
3673         float diffusetex[4];
3674         float glosstex[4];
3675         float surfacenormal[4];
3676         float lightnormal[4];
3677         float eyenormal[4];
3678         float specularnormal[4];
3679         float diffuse;
3680         float specular;
3681         float SpecularPower;
3682         float CubeVector[4];
3683         float attenuation;
3684         int d[4];
3685         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3686         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3687         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3688         Color_Glow[3] = 0.0f;
3689         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3690         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3691         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3692         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3693         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3694         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3695         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3696         Color_Diffuse[3] = 0.0f;
3697         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3698         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3699         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3700         Color_Specular[3] = 0.0f;
3701         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3702         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3703         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3704         Color_Pants[3] = 0.0f;
3705         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3706         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3707         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3708         Color_Shirt[3] = 0.0f;
3709         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3710         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3711         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3712         LightColor[3] = 0.0f;
3713         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3714         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3715         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3716         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3717         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3718         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3719         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3720         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3721         {
3722                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3723                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724         }
3725         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3726                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3727         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3728         {
3729                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3730                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3731                 for (x = startx;x < endx;x++)
3732                 {
3733                         z = buffer_z[x];
3734                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3735                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3736                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3737                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3738                         if (attenuation < 0.01f)
3739                                 continue;
3740                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3741                         {
3742                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3743                                 if (attenuation < 0.01f)
3744                                         continue;
3745                         }
3746
3747                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3748                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3749                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3750                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3751                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3752                         {
3753                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3754                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3755                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3756                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3757                         }
3758                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3759                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3760                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3761                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3762                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3763                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3764                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3765                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3766
3767                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3768                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3769                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3770                         DPSOFTRAST_Vector3Normalize(lightnormal);
3771
3772                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3773                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3774                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3775                         DPSOFTRAST_Vector3Normalize(eyenormal);
3776
3777                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3778                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3779                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3780                         DPSOFTRAST_Vector3Normalize(specularnormal);
3781
3782                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3783                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3784                         specular = pow(specular, SpecularPower * glosstex[3]);
3785                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3786                         {
3787                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3788                                 attenuation *= (1.0f / 255.0f);
3789                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3790                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3791                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3792                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3793                         }
3794                         else
3795                         {
3796                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3797                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3798                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3799                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3800                         }
3801                         buffer_FragColorbgra8[x*4+0] = d[0];
3802                         buffer_FragColorbgra8[x*4+1] = d[1];
3803                         buffer_FragColorbgra8[x*4+2] = d[2];
3804                         buffer_FragColorbgra8[x*4+3] = d[3];
3805                 }
3806         }
3807         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3808         {
3809                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3810                 for (x = startx;x < endx;x++)
3811                 {
3812                         z = buffer_z[x];
3813                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3814                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3815                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3816                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3817                         if (attenuation < 0.01f)
3818                                 continue;
3819                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3820                         {
3821                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3822                                 if (attenuation < 0.01f)
3823                                         continue;
3824                         }
3825
3826                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3827                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3828                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3829                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3830                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3831                         {
3832                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3833                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3834                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3835                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3836                         }
3837                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3838                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3839                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3840                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3841
3842                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3843                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3844                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3845                         DPSOFTRAST_Vector3Normalize(lightnormal);
3846
3847                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3848                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3849                         {
3850                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3851                                 attenuation *= (1.0f / 255.0f);
3852                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3853                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3854                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3855                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
3856                         }
3857                         else
3858                         {
3859                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3860                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3861                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3862                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3863                         }
3864                         buffer_FragColorbgra8[x*4+0] = d[0];
3865                         buffer_FragColorbgra8[x*4+1] = d[1];
3866                         buffer_FragColorbgra8[x*4+2] = d[2];
3867                         buffer_FragColorbgra8[x*4+3] = d[3];
3868                 }
3869         }
3870         else
3871         {
3872                 for (x = startx;x < endx;x++)
3873                 {
3874                         z = buffer_z[x];
3875                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3876                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3877                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3878                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3879                         if (attenuation < 0.01f)
3880                                 continue;
3881                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3882                         {
3883                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3884                                 if (attenuation < 0.01f)
3885                                         continue;
3886                         }
3887
3888                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3889                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3890                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3891                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3892                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3893                         {
3894                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3895                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3896                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3897                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3898                         }
3899                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3900                         {
3901                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3902                                 attenuation *= (1.0f / 255.0f);
3903                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3904                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3905                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3906                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
3907                         }
3908                         else
3909                         {
3910                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3911                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3912                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3913                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3914                         }
3915                         buffer_FragColorbgra8[x*4+0] = d[0];
3916                         buffer_FragColorbgra8[x*4+1] = d[1];
3917                         buffer_FragColorbgra8[x*4+2] = d[2];
3918                         buffer_FragColorbgra8[x*4+3] = d[3];
3919                 }
3920         }
3921         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3922 }
3923
3924
3925
3926 void DPSOFTRAST_VertexShader_Refraction(void) // refraction mode vertex stage: only the position array is run through DPSOFTRAST_Array_TransformProject
3927 {
3928         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, &dpsoftrast.uniform4f[4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1]); // matrix = the four uniform4f rows starting at ModelViewProjectionMatrixM1
3929 }
3930
3931 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span) // placeholder span shader for the refraction mode
3932 {
3933         // TODO: IMPLEMENT -- until then the span is finished with zero-filled BGRA8 data
3934         unsigned char zerospan[DPSOFTRAST_DRAW_MAXSPANLENGTH*4]; // 4 bytes per pixel
3935         float spanz[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // filled by Draw_Span_Begin, otherwise unused here
3936         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, spanz);
3937         memset(zerospan, 0, span->length*4);
3938         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, zerospan);
3939 }
3940
3941
3942
3943 void DPSOFTRAST_VertexShader_Water(void) // water mode vertex stage: only the position array is run through DPSOFTRAST_Array_TransformProject
3944 {
3945         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, &dpsoftrast.uniform4f[4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1]); // matrix = the four uniform4f rows starting at ModelViewProjectionMatrixM1
3946 }
3947
3948
3949 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span) // placeholder span shader for the water mode
3950 {
3951         // TODO: IMPLEMENT -- until then the span is finished with zero-filled BGRA8 data
3952         unsigned char zerospan[DPSOFTRAST_DRAW_MAXSPANLENGTH*4]; // 4 bytes per pixel
3953         float spanz[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // filled by Draw_Span_Begin, otherwise unused here
3954         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, spanz);
3955         memset(zerospan, 0, span->length*4);
3956         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, zerospan);
3957 }
3958
3959
3960
3961 void DPSOFTRAST_VertexShader_ShowDepth(void) // show-depth mode vertex stage: only the position array is run through DPSOFTRAST_Array_TransformProject
3962 {
3963         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, &dpsoftrast.uniform4f[4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1]); // matrix = the four uniform4f rows starting at ModelViewProjectionMatrixM1
3964 }
3965
3966 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span) // placeholder span shader for the show-depth mode
3967 {
3968         // TODO: IMPLEMENT -- until then the span is finished with zero-filled BGRA8 data
3969         unsigned char zerospan[DPSOFTRAST_DRAW_MAXSPANLENGTH*4]; // 4 bytes per pixel
3970         float spanz[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // filled by Draw_Span_Begin, otherwise unused here
3971         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, spanz);
3972         memset(zerospan, 0, span->length*4);
3973         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, zerospan);
3974 }
3975
3976
3977
3978 void DPSOFTRAST_VertexShader_DeferredGeometry(void) // deferred-geometry mode vertex stage: only the position array is run through DPSOFTRAST_Array_TransformProject
3979 {
3980         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, &dpsoftrast.uniform4f[4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1]); // matrix = the four uniform4f rows starting at ModelViewProjectionMatrixM1
3981 }
3982
3983 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span) // placeholder span shader for the deferred-geometry mode
3984 {
3985         // TODO: IMPLEMENT -- until then the span is finished with zero-filled BGRA8 data
3986         unsigned char zerospan[DPSOFTRAST_DRAW_MAXSPANLENGTH*4]; // 4 bytes per pixel
3987         float spanz[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // filled by Draw_Span_Begin, otherwise unused here
3988         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, spanz);
3989         memset(zerospan, 0, span->length*4);
3990         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, zerospan);
3991 }
3992
3993
3994
3995 void DPSOFTRAST_VertexShader_DeferredLightSource(void) // deferred-light-source mode vertex stage: only the position array is run through DPSOFTRAST_Array_TransformProject
3996 {
3997         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, &dpsoftrast.uniform4f[4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1]); // matrix = the four uniform4f rows starting at ModelViewProjectionMatrixM1
3998 }
3999
4000 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span) // placeholder span shader for the deferred-light-source mode
4001 {
4002         // TODO: IMPLEMENT -- until then the span is finished with zero-filled BGRA8 data
4003         unsigned char zerospan[DPSOFTRAST_DRAW_MAXSPANLENGTH*4]; // 4 bytes per pixel
4004         float spanz[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // filled by Draw_Span_Begin, otherwise unused here
4005         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, spanz);
4006         memset(zerospan, 0, span->length*4);
4007         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, zerospan);
4008 }
4009
4010
4011
4012 typedef struct DPSOFTRAST_ShaderModeInfo_s // one dispatch record per shader mode; instances live in DPSOFTRAST_ShaderModeTable, which fills the members positionally (do not reorder them)
4013 {
4014         int lodarrayindex; // NOTE(review): presumably the DPSOFTRAST_ARRAY_* index consulted for texture LOD selection -- its consumer is not in this part of the file, confirm there
4015         void (*Vertex)(void); // vertex stage of the mode (the table stores the DPSOFTRAST_VertexShader_* functions here)
4016         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span); // per-span pixel stage; called from DPSOFTRAST_Draw_ProcessSpans
4017         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL]; // DPSOFTRAST_ARRAY_* values listed by the mode; the table writes ~0 (the all-ones byte after conversion) after the last one
4018         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS]; // GL20TU_* values listed by the mode; the table closes the list with ~0 in the same way
4019 }
4020 DPSOFTRAST_ShaderModeInfo;
4021
4022 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] = // indexed by thread->shader_mode, so the row order has to follow the SHADERMODE_* numbering
4023 { // columns: lodarrayindex, vertex stage, span stage, {arrays..., ~0}, {texunits..., ~0}; every list is closed with ~0 explicitly, because an omitted list would be zero-filled and name index 0 instead of being empty
4024         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4025         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4026         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4027         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4028         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4029         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4030         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4031         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4032         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4033         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4034         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4035         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}, {~0}},
4036         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}, {~0}},
4037         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}, {~0}},
4038         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}, {~0}},
4039         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}, {~0}}
4040 };
4041
4042
4043 int DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int commandoffset, int endoffset) // applies the queued commands from commandoffset up to (not including) endoffset to this thread; returns the offset reached
4044 {
4045         while (commandoffset != endoffset) // exact-match exit: endoffset has to lie on a command boundary that the walk actually reaches
4046         {
4047                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset]; // the command header sits at this position in the shared command pool
4048                 switch (command->opcode)
4049                 { // INTERPCOMMAND(name) expands to one case: call DPSOFTRAST_Interpret_<name>, step over the command (its sizeof rounded up to a multiple of COMMAND_SIZE; the mask arithmetic relies on COMMAND_SIZE being a power of two), and wrap to 0 once the offset reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL
4050 #define INTERPCOMMAND(name) \
4051                 case DPSOFTRAST_OPCODE_##name : \
4052                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4053                         commandoffset += sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)); \
4054                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4055                                 commandoffset = 0; \
4056                         break;
4057                 INTERPCOMMAND(Viewport)
4058                 INTERPCOMMAND(ClearColor)
4059                 INTERPCOMMAND(ClearDepth)
4060                 INTERPCOMMAND(ColorMask)
4061                 INTERPCOMMAND(DepthTest)
4062                 INTERPCOMMAND(ScissorTest)
4063                 INTERPCOMMAND(Scissor)
4064                 INTERPCOMMAND(BlendFunc)
4065                 INTERPCOMMAND(BlendSubtract)
4066                 INTERPCOMMAND(DepthMask)
4067                 INTERPCOMMAND(DepthFunc)
4068                 INTERPCOMMAND(DepthRange)
4069                 INTERPCOMMAND(PolygonOffset)
4070                 INTERPCOMMAND(AlphaTest)
4071                 INTERPCOMMAND(AlphaFunc)
4072                 INTERPCOMMAND(SetTexture)
4073                 INTERPCOMMAND(SetShader)
4074                 INTERPCOMMAND(Uniform4f)
4075                 INTERPCOMMAND(UniformMatrix4f)
4076                 INTERPCOMMAND(Uniform1i)
4077                 // Reset has no handler call: it only moves the read position back to the start of the pool
4078                 case DPSOFTRAST_OPCODE_Reset:
4079                         commandoffset = 0;
4080                         break;
4081                 } // NOTE(review): there is no default case -- an opcode not listed above leaves commandoffset untouched, so the loop would spin forever on it
4082         }
4083         return commandoffset;
4084 }
4085                                         
4086 int DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread, int commandoffset) // depth-tests and shades every span queued on this thread, then empties the queue; returns the command offset reached while catching up on state commands
4087 {
4088         int i;
4089         int x;
4090         int startx;
4091         int endx;
4092 //      unsigned int c;
4093 //      unsigned int *colorpixel;
4094         unsigned int *depthpixel; // depth buffer location of the span's first pixel
4095         float w;
4096         float wslope;
4097         int depth; // scaled integer depth at the span's first pixel
4098         int depthslope; // scaled integer depth step per pixel in x
4099         unsigned int d; // running depth for the pixel being compared or written
4100         DPSOFTRAST_State_Triangle *triangle;
4101         DPSOFTRAST_State_Span *span;
4102         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH]; // one pass/fail flag per pixel of the current span
4103         for (i = 0; i < thread->numspans; i++)
4104         {
4105                 span = &thread->spans[i];
4106                 triangle = &dpsoftrast.trianglepool.triangles[span->triangle];
4107                 if (commandoffset != triangle->commandoffset) // the triangle was queued at a later command position than the one this thread has applied: catch up first
4108                 {
4109                         commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4110                         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4111                 }
4112                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4113                 {
4114                         wslope = triangle->w[0]; // w[0] is the x coefficient of the triangle's w plane
4115                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1]; // plane evaluated at the span start: constant w[2] + x*w[0] + y*w[1]
4116                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4117                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0])); // polygon offset bias: polygonoffset[1] is a constant term, polygonoffset[0] is scaled by the magnitude of the x slope
4118                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4119                         switch(thread->fb_depthfunc) // fill pixelmask by comparing the stored depth (left operand) with the incoming depth d
4120                         {
4121                         default: // an unrecognised depth function behaves like GL_ALWAYS
4122                         case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4123                         case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4124                         case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4125                         case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4126                         case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4127                         case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4128                         case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4129                         }
4130                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4131                         //for (x = 0;x < span->length;x++)
4132                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4133                         // shrink [startx, endx) so that it excludes the failed pixels at both ends of the span
4134                         startx = 0;
4135                         endx = span->length;
4136                         while (startx < endx && !pixelmask[startx])
4137                                 startx++;
4138                         while (endx > startx && !pixelmask[endx-1])
4139                                 endx--;
4140                         if (startx >= endx)
4141                                 continue; // no pixels to fill
4142                         span->pixelmask = pixelmask; // points at this function's stack buffer, so it is only meaningful during this iteration
4143                         span->startx = startx;
4144                         span->endx = endx;
4145                         // run pixel shader if appropriate
4146                         // do this before running depthmask code, to allow the pixelshader
4147                         // to clear pixelmask values for alpha testing
4148                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4149                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4150                         if (thread->depthmask) // depth writes enabled: store d for every pixel still flagged after the shader ran
4151                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4152                                         if (pixelmask[x])
4153                                                 depthpixel[x] = d;
4154                 }
4155                 else
4156                 {
4157                         // no depth testing means we're just dealing with color...
4158                         // if there is no color buffer, skip pixel shader
4159                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4160                         {
4161                                 memset(pixelmask, 1, span->length); // every pixel of the span passes
4162                                 span->pixelmask = pixelmask;
4163                                 span->startx = 0;
4164                                 span->endx = span->length;
4165                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4166                         } // note: the depth buffer is never written on this path
4167                 }
4168         }
4169         thread->numspans = 0; // all queued spans consumed
4170         return commandoffset;
4171 }
4172
4173 void DPSOFTRAST_Draw_GenerateSpans(DPSOFTRAST_State_Thread *thread, int freetriangle)
4174 {
4175         int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4176         int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4177         int commandoffset = thread->commandoffset;
4178         int triangleoffset = thread->triangleoffset;
4179         DPSOFTRAST_State_Triangle *triangle = NULL;
4180         int starty;
4181         int endy;
4182         int y;
4183         int numpoints;
4184         __m128 coords[4];
4185         __m128i ycoords;
4186         while (triangleoffset != freetriangle)
4187         {
4188                 triangle = &dpsoftrast.trianglepool.triangles[triangleoffset];
4189                 if (++triangleoffset >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL)
4190                         triangleoffset = 0;
4191                 starty = triangle->starty + 1;
4192                 endy = triangle->endy;
4193                 if (starty >= maxy || endy <= miny)
4194                         continue;
4195                 numpoints = triangle->numpoints;
4196                 coords[0] = _mm_load_ps(triangle->coords[0]);
4197                 coords[1] = _mm_load_ps(triangle->coords[1]);
4198                 coords[2] = _mm_load_ps(triangle->coords[2]);
4199                 coords[3] = _mm_load_ps(triangle->coords[3]);
4200                 ycoords = _mm_load_si128((const __m128i *)triangle->ycoords);
4201                 if (starty < miny)
4202                         starty = miny;
4203                 if (endy > maxy)
4204                         endy = maxy;
4205                 for (y = starty; y < endy;)
4206                 {
4207                         __m128 xcoords, xslope;
4208                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), ycoords);
4209                         int yccmask = _mm_movemask_epi8(ycc);
4210                         int edge0p, edge0n, edge1p, edge1n;
4211                         int nexty;
4212                         if (numpoints == 4)
4213                         {
4214                                 switch(yccmask)
4215                                 {
4216                                 default:
4217                                 case 0xFFFF: /*0000*/ y = endy; continue;
4218                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4219                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4220                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4221                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4222                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4223                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4224                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4225                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4226                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4227                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4228                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4229                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4230                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4231                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4232                                 case 0x0000: /*1111*/ y++; continue;
4233                                 }
4234                         }
4235                         else
4236                         {
4237                                 switch(yccmask)
4238                                 {
4239                                 default:
4240                                 case 0xFFFF: /*000*/ y = endy; continue;
4241                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4242                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4243                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4244                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4245                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4246                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4247                                 case 0x0000: /*111*/ y++; continue;
4248                                 }
4249                         }
4250                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), ycoords);
4251                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4252                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4253                         nexty = _mm_extract_epi16(ycc, 0);
4254                         if(nexty >= endy) nexty = endy-1;
4255                         if (_mm_ucomigt_ss(_mm_max_ss(coords[edge0n], coords[edge0p]), _mm_min_ss(coords[edge1n], coords[edge1p])))
4256                         {
4257                                 int tmp = edge0n;
4258                                 edge0n = edge1n;
4259                                 edge1n = tmp;
4260                                 tmp = edge0p;
4261                                 edge0p = edge1p;
4262                                 edge1p = tmp;
4263                         }
4264                         xslope = _mm_sub_ps(_mm_movelh_ps(coords[edge0n], coords[edge1n]), _mm_movelh_ps(coords[edge0p], coords[edge1p]));
4265                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4266                         xcoords = _mm_add_ps(_mm_movelh_ps(coords[edge0p], coords[edge1p]),
4267                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(coords[edge0p], coords[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4268                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4269                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4270                         {
4271                                 int startx, endx, offset;
4272                                 startx = _mm_cvtss_si32(xcoords);
4273                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4274                                 if (startx < 0) startx = 0;
4275                                 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4276                                 if (startx >= endx) continue;
4277                                 for (offset = startx; offset < endx;)
4278                                 {
4279                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4280                                         span->triangle = (int)(triangle - dpsoftrast.trianglepool.triangles);
4281                                         span->x = offset;
4282                                         span->y = y;
4283                                         span->length = endx - offset;
4284                                         if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4285                                                 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4286                                         offset += span->length;
4287                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4288                                                 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
4289                                 }
4290                         }
4291                 }
4292         }
4293
4294         if (thread->numspans > 0)
4295                 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
4296         if (commandoffset != triangle->commandoffset)
4297         {
4298                 commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4299                 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4300         }
4301         
4302         MEMORY_BARRIER;
4303
4304         thread->commandoffset = commandoffset;
4305         thread->triangleoffset = triangleoffset;
4306 }
4307
4308 void DPSOFTRAST_Draw_FlushThreads(void)
4309 {
4310         DPSOFTRAST_State_Thread *thread;
4311         int i;
4312         if(dpsoftrast.drawtriangle != dpsoftrast.trianglepool.freetriangle)
4313         {
4314                 MEMORY_BARRIER;
4315                 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4316         }
4317 #ifdef USETHREADS
4318         SDL_LockMutex(dpsoftrast.trianglemutex);
4319 #endif
4320         for (i = 0; i < dpsoftrast.numthreads; i++)
4321         {
4322                 thread = &dpsoftrast.threads[i];
4323 #ifdef USETHREADS
4324                 while (thread->triangleoffset != dpsoftrast.drawtriangle)
4325                 {
4326                         thread->waiting = true;
4327                         SDL_CondBroadcast(dpsoftrast.trianglecond);
4328                         SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
4329                         thread->waiting = false;
4330                 }
4331 #else
4332                 if (thread->triangleoffset != dpsoftrast.drawtriangle) 
4333                         DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle);
4334 #endif
4335         }
4336 #ifdef USETHREADS
4337         SDL_UnlockMutex(dpsoftrast.trianglemutex);
4338 #endif
4339         dpsoftrast.trianglepool.usedtriangles = 0;
4340         dpsoftrast.commandpool.usedcommands = 0;
4341 }
4342
4343 #ifdef USETHREADS
4344 static int DPSOFTRAST_Draw_Thread(void *data)
4345 {
4346         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4347         while(thread->index >= 0)
4348         {
4349                 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4350                 {
4351                         DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle); 
4352                 }
4353                 else 
4354                 {
4355                         SDL_LockMutex(dpsoftrast.trianglemutex);
4356                         if (thread->triangleoffset != dpsoftrast.drawtriangle)
4357                         {
4358                                 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4359                                 continue;
4360                         }
4361                         if (thread->waiting) SDL_CondSignal(thread->waitcond);
4362                         SDL_CondWait(dpsoftrast.trianglecond, dpsoftrast.trianglemutex);
4363                         SDL_UnlockMutex(dpsoftrast.trianglemutex);
4364                 }
4365         }   
4366         return 0;
4367 }
4368 #endif
4369
// Sets up numtriangles triangles from the already transformed vertex arrays and appends
// them to dpsoftrast.trianglepool for span generation.
//
// For each triangle: resolve its three vertex indices, clip it against the near plane,
// skip it according to dpsoftrast.cullface, skip it if its bounds fall outside the
// framebuffer, then store its screen coordinates, integer y coordinates, w plane,
// per-array attribute planes and per-texture-unit mip levels in the next pool slot.
// Queued triangles are published through dpsoftrast.drawtriangle every
// DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES triangles and once more at the end.
//
// firstvertex - subtracted from every index read from element3i/element3s
// numtriangles - number of triangles to process
// element3i   - int index triples, or NULL
// element3s   - unsigned short index triples, used when element3i is NULL; when both
//               are NULL the vertices are taken sequentially (i*3+0, i*3+1, i*3+2)
// arraymask   - arraymask[j] nonzero means attribute array j gets an interpolation plane
// numarrays   - number of arraymask entries to examine
//
// The whole body is compiled only when SSE2_PRESENT is defined; without it the function
// does nothing.
4370 void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask, int numarrays)
4371 {
4372 #ifdef SSE2_PRESENT
4373         int cullface = dpsoftrast.cullface;
4374         int width = dpsoftrast.fb_width;
4375         int height = dpsoftrast.fb_height;
             // (width-1, height-1) repeated in 16-bit lanes, upper clamp for the screen bounds
4376         __m128i fbmax = _mm_sub_epi16(_mm_setr_epi16(width, height, width, height, width, height, width, height), _mm_set1_epi16(1));
4377         DPSOFTRAST_State_Triangle *triangle;
             // triangles queued since the last publish of dpsoftrast.drawtriangle
4378         int numqueued = 0;
4379         int i;
4380         int j;
4381         int k;
4382         int y;
4383         int e[3];
4384         __m128i screeny;
4385         int starty, endy;
4386         int numpoints;
4387         int clipcase;
4388         float clipdist[4];
4389         __m128 triangleedge1, triangleedge2, trianglenormal;
4390         __m128 clipfrac[3];
4391         __m128 screen[4];
4392         DPSOFTRAST_Texture *texture;
4393         screen[3] = _mm_setzero_ps();
4394         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps(); 
4395         for (i = 0;i < numtriangles;i++)
4396         {
4397                 // generate the 3 edges of this triangle
4398                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4399                 if (element3i)
4400                 {
4401                         e[0] = element3i[i*3+0] - firstvertex;
4402                         e[1] = element3i[i*3+1] - firstvertex;
4403                         e[2] = element3i[i*3+2] - firstvertex;
4404                 }
4405                 else if (element3s)
4406                 {
4407                         e[0] = element3s[i*3+0] - firstvertex;
4408                         e[1] = element3s[i*3+1] - firstvertex;
4409                         e[2] = element3s[i*3+2] - firstvertex;
4410                 }
4411                 else
4412                 {
4413                         e[0] = i*3+0;
4414                         e[1] = i*3+1;
4415                         e[2] = i*3+2;
4416                 }
4417
                     // SKIPBACKFACE: computes the two screen-space edges leaving screen[1] and the
                     // scalar (lane 0) term edge1.x*edge2.y - edge1.y*edge2.x, then skips the
                     // triangle with `continue` when cullface is GL_BACK and the term is negative,
                     // or cullface is GL_FRONT and it is positive.  Must be expanded inside the
                     // triangle loop.  triangleedge1/2 and trianglenormal are reused further down.
4418 #define SKIPBACKFACE \
4419                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4420                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4421                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4422                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4423                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4424                 switch(cullface) \
4425                 { \
4426                 case GL_BACK: \
4427                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4428                                 continue; \
4429                         break; \
4430                 case GL_FRONT: \
4431                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4432                                 continue; \
4433                         break; \
4434                 }
4435                         //trianglenormal = _mm_sub_ps(_mm_mul_ps(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))),
4436                         //                                                _mm_mul_ps(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1]));
4437                         //trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
4438                         //trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
4439                         //trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
4440
4441                         // macros for clipping vertices
4442
                             // CLIPPEDVERTEXLERP: screen[k] = projection of the point on edge p1->p2 where
                             // clipdist interpolates to zero; the fraction is kept in clipfrac[p1] so the
                             // attribute macros below can lerp with the same weight
4443 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4444                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4445                         { \
4446                                 __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p2]*4]); \
4447                                 screen[k] = DPSOFTRAST_Draw_ProjectVertex(_mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1]))); \
4448                         }
                             // CLIPPEDVERTEXCOPY: screen[k] = stored screen coordinate of unclipped vertex p1
4449 #define CLIPPEDVERTEXCOPY(k,p1) \
4450                         screen[k] = _mm_load_ps(&dpsoftrast.screencoord4f[e[p1]*4]);
4451
                     // GENATTRIBCOPY / GENATTRIBLERP: fetch attribute array j at vertex p1, or lerp it
                     // from p1 towards p2 by clipfrac[p1]
4452 #define GENATTRIBCOPY(j, attrib, p1) \
4453                 attrib = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]);
4454 #define GENATTRIBLERP(j, attrib, p1, p2) \
4455                 { \
4456                         __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p2]*4]); \
4457                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4458                 }
                     // GENATTRIBS: produce the attribute values matching screen[0], screen[1] and
                     // screen[2] for the clipcase chosen by the near plane classification below
4459 #define GENATTRIBS(j, attrib0, attrib1, attrib2) \
4460                 switch(clipcase) \
4461                 { \
4462                 default: \
4463                 case 0: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4464                 case 1: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4465                 case 2: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4466                 case 3: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4467                 case 4: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4468                 case 5: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4469                 case 6: GENATTRIBLERP(j, attrib0, 1, 2); GENATTRIBCOPY(j, attrib1, 2); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4470                 }
4471
4472                 // calculate distance from nearplane
                     // (z + w of the transformed position; a vertex is kept when this is >= 0)
4473                 clipdist[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+3];
4474                 clipdist[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+3];
4475                 clipdist[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+3];
                     // classify which vertices are in front of the near plane; one clipped vertex
                     // yields a 4-point polygon, two clipped vertices yield a smaller triangle
4476                 if (clipdist[0] >= 0.0f)
4477                 {
4478                         if (clipdist[1] >= 0.0f)
4479                         {
4480                                 if (clipdist[2] >= 0.0f)
4481                                 {
4482                                         // triangle is entirely in front of nearplane
4483                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4484                                         SKIPBACKFACE;
4485                                         numpoints = 3;
4486                                         clipcase = 0;
4487                                 }
4488                                 else
4489                                 {
                                             // only vertex 2 is behind the near plane
4490                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4491                                         SKIPBACKFACE;
4492                                         numpoints = 4;
4493                                         clipcase = 1;
4494                                 }
4495                         }
4496                         else 
4497                         {
4498                                 if (clipdist[2] >= 0.0f)
4499                                 {
                                             // only vertex 1 is behind the near plane
4500                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2);     CLIPPEDVERTEXCOPY(3,2);
4501                                         SKIPBACKFACE;
4502                                         numpoints = 4;
4503                                         clipcase = 2;
4504                                 }
4505                                 else
4506                                 {
                                             // vertices 1 and 2 are behind the near plane
4507                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4508                                         SKIPBACKFACE;
4509                                         numpoints = 3;
4510                                         clipcase = 3;
4511                                 }
4512                         }
4513                 }                       
4514                 else if (clipdist[1] >= 0.0f)
4515                 {
4516                         if (clipdist[2] >= 0.0f)
4517                         {
                                     // only vertex 0 is behind the near plane
4518                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4519                                 SKIPBACKFACE;
4520                                 numpoints = 4;
4521                                 clipcase = 4;
4522                         }
4523                         else
4524                         {
                                     // vertices 0 and 2 are behind the near plane
4525                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4526                                 SKIPBACKFACE;
4527                                 numpoints = 3;
4528                                 clipcase = 5;
4529                         }
4530                 }
4531                 else if (clipdist[2] >= 0.0f)
4532                 {
                             // vertices 0 and 1 are behind the near plane
4533                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4534                         SKIPBACKFACE;
4535                         numpoints = 3;
4536                         clipcase = 6;
4537                 }
4538                 else continue; // triangle is entirely behind nearplane
4539
4540                 {
4541                         // calculate integer y coords for triangle points
                             // screeni holds the truncated x,y of points 0..3 as 16-bit lanes
                             // (x0,y0,x1,y1,x2,y2,x3,y3); for 3-point polygons point 2 is duplicated
                             // into the fourth slot.  The shuffles reduce to min/max x,y in lanes 0,1.
4542                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4543                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)), 
4544                                         screenmin = _mm_min_epi16(screeni, screenir), 
4545                                         screenmax = _mm_max_epi16(screeni, screenir);
4546                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4547                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
                             // clamp the bounds to [0, width-1] x [0, height-1]
4548                         screenmin = _mm_max_epi16(screenmin, _mm_setzero_si128());
4549                         screenmax = _mm_min_epi16(screenmax, fbmax);
4550                         // skip offscreen triangles
                             // (the low 32 bits cover the x and y lanes: nonzero when max < min in either)
4551                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4552                                 continue;
4553                         starty = _mm_extract_epi16(screenmin, 1);
4554                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // arithmetic shift leaves the sign-extended y of each point in a 32-bit lane
4555                         screeny = _mm_srai_epi32(screeni, 16);
4556                 }
4557
                     // pool (nearly) full: make room before taking the next slot.
                     // NOTE(review): DPSOFTRAST_Draw_FreeTrianglePool is defined elsewhere; presumably
                     // it waits until the requested number of slots is free - confirm.
4558                 if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
4559 #ifdef USETHREADS
4560                         DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
4561 #else
4562                         DPSOFTRAST_Draw_FlushThreads();
4563 #endif
4564
                     // fill in the next free pool slot
4565                 triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
4566                 triangle->commandoffset = dpsoftrast.commandpool.freecommand;
4567                 triangle->starty = starty;
4568                 triangle->endy = endy;
4569                 triangle->numpoints = numpoints;
4570                 _mm_store_ps(triangle->coords[0], screen[0]);
4571                 _mm_store_ps(triangle->coords[1], screen[1]);
4572                 _mm_store_ps(triangle->coords[2], screen[2]);
4573                 _mm_store_ps(triangle->coords[3], numpoints > 3 ? screen[3] : screen[2]);
4574                 _mm_store_si128((__m128i *)triangle->ycoords, screeny);
4575
4576                 // calculate attribute plans for triangle data...
4577                 // okay, this triangle is going to produce spans, we'd better project
4578                 // the interpolants now (this is what gives perspective texturing),
4579                 // this consists of simply multiplying all arrays by the W coord
4580                 // (which is basically 1/Z), which will be undone per-pixel
4581                 // (multiplying by Z again) to get the perspective-correct array
4582                 // values
4583                 {
4584                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
                             // edge components divided by the scalar computed in SKIPBACKFACE, then
                             // broadcast one component per register
4585                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4586                 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4587                 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4588                 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4589                 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
                     // w (lane 3 of the screen coordinate) of the first three points, broadcast
4590                 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4591                 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4592                 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself: per-x slope, per-y slope and value at the screen origin,
                             // anchored at point 1
4593                         attribedge1 = _mm_sub_ss(w0, w1);
4594                         attribedge2 = _mm_sub_ss(w2, w1);
4595                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4596                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4597                 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4598                 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4599                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4600                         _mm_store_ss(&triangle->w[0], attribxslope);
4601                         _mm_store_ss(&triangle->w[1], attribyslope);
4602                         _mm_store_ss(&triangle->w[2], attriborigin);
                             // same plane setup for every enabled attribute array, on all four
                             // components at once and with each vertex value premultiplied by its w
4603                         for (j = 0;j < numarrays;j++)
4604                         {
4605                                 if (arraymask[j])
4606                                 {
4607                                         __m128 attrib0, attrib1, attrib2;
4608                                         GENATTRIBS(j, attrib0, attrib1, attrib2);
4609                                         attriborigin = _mm_mul_ps(attrib1, w1);
4610                                         attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4611                                         attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4612                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4613                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4614                                         attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
                                             // non-temporal stores of x slope, y slope and origin
4615                                         _mm_stream_ps(triangle->attribs[j][0], attribxslope);
4616                                         _mm_stream_ps(triangle->attribs[j][1], attribyslope);
4617                                 _mm_stream_ps(triangle->attribs[j][2], attriborigin);
4618                         }
4619                     }
4620             }
4621
4622                 // adjust texture LOD by texture density, in the simplest way possible...
4623                 {
4624                         __m128 mipedgescale, mipedgetc, mipdensity, attrib0, attrib1, attrib2;
                             // default every unit to mip 0
4625                         memset(triangle->mip, 0, sizeof(triangle->mip));
                             // approximate reciprocal screen-space length of the two edges
4626                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4627                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4628                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
                             // change of the shader's LOD attribute array along each edge, per screen unit
4629                         k = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].lodarrayindex;
4630                         GENATTRIBS(k, attrib0, attrib1, attrib2);
4631                         mipedgetc = _mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1));
4632                         mipedgetc = _mm_mul_ps(mipedgetc, mipedgescale);
4633                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4634                         {
4635                                 int texunit = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].texunits[j];
                                     // an out-of-range unit number terminates the shader's texunit list
4636                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4637                                         break;
4638                                 texture = dpsoftrast.texbound[texunit];
                                     // only filters ranked above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a mip level
4639                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4640                                 {
                                             // NOTE(review): mipmap[0][2] and [3] are presumably the width and
                                             // height of mip level 0 - confirm against DPSOFTRAST_Texture
4641                                         mipdensity = _mm_mul_ps(mipedgetc, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4642                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4643                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
                                             // use the smaller of the two edges' squared densities
4644                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4645                                         // this will be multiplied in the texturing routine by the texture resolution
4646                                         y = _mm_cvtss_si32(mipdensity);
4647                                         if (y > 0)
4648                                         {
                                                     // 0.5*log2 of the squared density, truncated and capped at the last mip
4649                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4650                                                 if (y > texture->mipmaps - 1)
4651                                                         y = texture->mipmaps - 1;
4652                                                 triangle->mip[texunit] = y;
4653                                         }
4654                                 }
4655                         }
4656                 }
4657
                 // advance the ring buffer write position, wrapping at the end of the pool
4658             dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
4659                 dpsoftrast.trianglepool.usedtriangles++;
4660
4661                 numqueued++;
                     // publish a full batch: expose the new triangles through drawtriangle, then wake
                     // the worker threads (threaded build) or rasterize immediately (unthreaded build)
4662                 if (numqueued >= DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES)
4663                 {
4664                         MEMORY_BARRIER;
4665                         dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4666
4667 #ifdef USETHREADS
4668                         SDL_LockMutex(dpsoftrast.trianglemutex);
4669                         SDL_CondBroadcast(dpsoftrast.trianglecond);
4670                         SDL_UnlockMutex(dpsoftrast.trianglemutex);
4671 #else
4672                         DPSOFTRAST_Draw_FlushThreads();
4673 #endif
4674                         numqueued = 0;
4675                 }
4676         }
             // publish whatever is left over from the last partial batch
4677         if (numqueued > 0)
4678         {
4679                 MEMORY_BARRIER;
4680                 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4681
4682 #ifdef USETHREADS
4683                 SDL_LockMutex(dpsoftrast.trianglemutex);
4684                 SDL_CondBroadcast(dpsoftrast.trianglecond);
4685                 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4686 #else
4687                 DPSOFTRAST_Draw_FlushThreads();
4688 #endif
4689         }
4690 #endif
4691 }
4692
4693 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4694 {
4695         int i;
4696         int lastarray = DPSOFTRAST_ARRAY_POSITION;
4697         unsigned char arraymask[DPSOFTRAST_ARRAY_TOTAL];
4698         memset(arraymask, false, sizeof(arraymask));
4699         arraymask[DPSOFTRAST_ARRAY_POSITION] = true;
4700         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4701         {
4702                 int arrayindex = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4703                 if (arrayindex >= DPSOFTRAST_ARRAY_TOTAL)
4704                         break;
4705                 switch (arrayindex)
4706                 {
4707                         case DPSOFTRAST_ARRAY_POSITION:
4708                         case DPSOFTRAST_ARRAY_COLOR: 
4709                                 break;
4710                         default:
4711                                 if (dpsoftrast.pointer_texcoordf[arrayindex-DPSOFTRAST_ARRAY_TEXCOORD0] == NULL)
4712                                         continue;
4713                                 break;
4714                 }
4715                 arraymask[arrayindex] = true;
4716                 if (arrayindex > lastarray)
4717                         lastarray = arrayindex;
4718         }
4719         DPSOFTRAST_Draw_LoadVertices(firstvertex, numvertices, arraymask[DPSOFTRAST_ARRAY_COLOR]);
4720         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4721 //      DPSOFTRAST_Draw_ProjectVertices(dpsoftrast.screencoord4f, dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], numvertices);
4722         DPSOFTRAST_Draw_ProcessTriangles(firstvertex, numtriangles, element3i, element3s, arraymask, lastarray+1);
4723 }
4724
// Public flush entry point: calls DPSOFTRAST_Draw_SyncCommands first and
// DPSOFTRAST_Draw_FlushThreads second, in that order.
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_SyncCommands();
        DPSOFTRAST_Draw_FlushThreads();
}
4730
4731 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4732 {
4733         int i;
4734         union
4735         {
4736                 int i;
4737                 unsigned char b[4];
4738         }
4739         u;
4740         u.i = 1;
4741         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4742         dpsoftrast.bigendian = u.b[3];
4743         dpsoftrast.fb_width = width;
4744         dpsoftrast.fb_height = height;
4745         dpsoftrast.fb_depthpixels = depthpixels;
4746         dpsoftrast.fb_colorpixels[0] = colorpixels;
4747         dpsoftrast.fb_colorpixels[1] = NULL;
4748         dpsoftrast.fb_colorpixels[1] = NULL;
4749         dpsoftrast.fb_colorpixels[1] = NULL;
4750         dpsoftrast.texture_firstfree = 1;
4751         dpsoftrast.texture_end = 1;
4752         dpsoftrast.texture_max = 0;
4753         dpsoftrast.viewport[0] = 0;
4754         dpsoftrast.viewport[1] = 0;
4755         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4756         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4757         dpsoftrast.color[0] = 1;
4758         dpsoftrast.color[1] = 1;
4759         dpsoftrast.color[2] = 1;
4760         dpsoftrast.color[3] = 1;
4761         dpsoftrast.cullface = GL_BACK;
4762 #ifdef USETHREADS
4763         dpsoftrast.numthreads = bound(1, numthreads, 64);
4764         dpsoftrast.trianglemutex = SDL_CreateMutex();
4765         dpsoftrast.trianglecond = SDL_CreateCond();
4766 #else
4767         dpsoftrast.numthreads = 1;
4768 #endif
4769         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4770         for (i = 0; i < dpsoftrast.numthreads; i++)
4771         {
4772                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4773                 thread->index = i;
4774                 thread->colormask[1] = 1;
4775                 thread->colormask[2] = 1;
4776                 thread->colormask[3] = 1;
4777                 thread->blendfunc[0] = GL_ONE;
4778                 thread->blendfunc[1] = GL_ZERO;
4779                 thread->depthmask = true;
4780                 thread->depthtest = true;
4781                 thread->depthfunc = GL_LEQUAL;
4782                 thread->scissortest = false;
4783                 thread->alphatest = false;
4784                 thread->alphafunc = GL_GREATER;
4785                 thread->alphavalue = 0.5f;
4786                 thread->scissor[0] = 0;
4787                 thread->scissor[1] = 0;
4788                 thread->scissor[2] = dpsoftrast.fb_width;
4789                 thread->scissor[3] = dpsoftrast.fb_height;
4790                 thread->depthrange[0] = 0;
4791                 thread->depthrange[1] = 1;
4792                 thread->polygonoffset[0] = 0;
4793                 thread->polygonoffset[1] = 0;
4794
4795                 thread->numspans = 0;
4796                 thread->triangleoffset = 0;
4797                 thread->commandoffset = 0;
4798                 thread->waiting = false;
4799 #ifdef USETHREADS
4800                 thread->waitcond = SDL_CreateCond();
4801 #endif
4802
4803                 thread->validate = -1;
4804                 DPSOFTRAST_Validate(thread, -1);
4805 #ifdef USETHREADS
4806                 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4807 #endif
4808         }
4809 }
4810
4811 void DPSOFTRAST_Shutdown(void)
4812 {
4813         int i;
4814 #ifdef USETHREADS
4815         if(dpsoftrast.numthreads > 0)
4816         {
4817                 DPSOFTRAST_State_Thread *thread;
4818                 SDL_LockMutex(dpsoftrast.trianglemutex);
4819                 for (i = 0; i < dpsoftrast.numthreads; i++)
4820                 {
4821                         thread = &dpsoftrast.threads[i];
4822                         thread->index = -1;
4823                 }
4824                 SDL_CondBroadcast(dpsoftrast.trianglecond);
4825                 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4826                 for (i = 0; i < dpsoftrast.numthreads; i++)
4827                 {
4828                         thread = &dpsoftrast.threads[i];
4829                         SDL_WaitThread(thread->thread, NULL);
4830                         SDL_DestroyCond(thread->waitcond);
4831                 }
4832                 SDL_DestroyMutex(dpsoftrast.trianglemutex);
4833                 SDL_DestroyCond(dpsoftrast.trianglecond);
4834         }
4835 #endif
4836         for (i = 0;i < dpsoftrast.texture_end;i++)
4837                 if (dpsoftrast.texture[i].bytes)
4838                         MM_FREE(dpsoftrast.texture[i].bytes);
4839         if (dpsoftrast.texture)
4840                 free(dpsoftrast.texture);
4841         if (dpsoftrast.threads)
4842                 MM_FREE(dpsoftrast.threads);
4843         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4844 }
4845