de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
only force flushing on a texture update if a texture is already bound (for quake)
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "dpsoftrast.h"
7
8 #ifdef USE_SDL
9 #define USE_THREADS
10 #endif
11
12 #ifdef USE_THREADS
13 #include <SDL.h>
14 #include <SDL_thread.h>
15 #endif
16
17 #ifndef __cplusplus
18 typedef qboolean bool;
19 #endif
20
// byte alignments requested by ALIGN() and ATOMIC() respectively
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// Compiler-specific spellings of the alignment attributes and, for threaded
// builds, of the memory barrier / atomic counter primitives.
#ifdef SSE2_PRESENT
	#if defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
		#ifdef USE_THREADS
			#define MEMORY_BARRIER (_mm_sfence())
			//(__sync_synchronize())
			#define ATOMIC_COUNTER volatile int
			#define ATOMIC_ADD(counter, val) (__sync_add_and_fetch(&(counter), (val)))
		#endif
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(32)) var
		#ifdef USE_THREADS
			#define MEMORY_BARRIER (_mm_sfence())
			//(MemoryBarrier())
			#define ATOMIC_COUNTER volatile LONG
			#define ATOMIC_ADD(counter, val) (InterlockedAdd(&(counter), (val)))
		#endif
	#else
		// unknown compiler: no alignment attributes and no atomics, so drop both features
		#undef USE_THREADS
		#undef SSE2_PRESENT
	#endif
#endif

#ifndef SSE2_PRESENT
	// The threaded MEMORY_BARRIER / ATOMIC_COUNTER / ATOMIC_ADD definitions only
	// exist in the SSE2 section above, so a build without SSE2 must fall back to
	// the single-threaded definitions below instead of leaving them undefined.
	#undef USE_THREADS
	#define ALIGN(var) var
	#define ATOMIC(var) var
#endif

#ifndef USE_THREADS
	// single-threaded fallbacks: no barrier needed, plain int counter
	#define MEMORY_BARRIER ((void)0)
	#define ATOMIC_COUNTER int
	#define ATOMIC_ADD(counter, val) ((counter) += (val))
#endif
59
// Allocation helpers. SSE2 builds allocate ATOMIC_SIZE-aligned memory through
// _mm_malloc/_mm_free; other builds use the plain C allocator.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned counterpart of calloc(): returns nmemb*size zeroed bytes, or NULL if
// the allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	void *ptr;
	// calloc() rejects a product that overflows; do the same here
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, nmemb*size);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
78
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets); DPSOFTRAST_ARRAY_TOTAL is the number of arrays.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL = 10
}
DPSOFTRAST_ARRAY;
94
95 typedef struct DPSOFTRAST_Texture_s
96 {
97         int flags;
98         int width;
99         int height;
100         int depth;
101         int sides;
102         DPSOFTRAST_TEXTURE_FILTER filter;
103         int mipmaps;
104         int size;
105         ATOMIC_COUNTER binds;
106         unsigned char *bytes;
107         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
108 }
109 DPSOFTRAST_Texture;
110
// commands are aligned the same way as other ALIGN()ed data
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// common header shared by every command: the opcode identifying its type
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	int opcode;
}
DPSOFTRAST_Command);

// opcode 0 is reserved for the reset command
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// Declares a command type: defines the constant DPSOFTRAST_OPCODE_<name> with
// value opcodeval and the struct DPSOFTRAST_Command_<name>, which starts with
// the opcode field followed by the given fields.
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		int opcode; \
		fields \
	} DPSOFTRAST_Command_##name );
129
130 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
131
132 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
133 {
134         int freecommand;
135         int usedcommands;
136         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
137 }
138 DPSOFTRAST_State_Command_Pool);
139
140 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
141 {
142         int commandoffset;
143         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
144         int starty;
145         int endy;
146         int numpoints;
147         float w[3];
148         ALIGN(float coords[4][4]);
149         ALIGN(int ycoords[4]);
150         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
151 }
152 DPSOFTRAST_State_Triangle);
153
// Evaluates attribute array 'arrayindex' of 'triangle' at the start of 'span'
// (SSE version, slope/data are __m128 lvalues):
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: slope and data are float[4] arrays.
// 'span' is parenthesized on every use so that any pointer expression
// (not just a plain identifier) may be passed.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	slope[0] = (triangle)->attribs[arrayindex][0][0]; \
	slope[1] = (triangle)->attribs[arrayindex][0][1]; \
	slope[2] = (triangle)->attribs[arrayindex][0][2]; \
	slope[3] = (triangle)->attribs[arrayindex][0][3]; \
	data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
170                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// A horizontal run of pixels belonging to one triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle; // triangle this span was generated by
	int x, y; // framebuffer coords of the span
	int length; // pixel count
	int startx, endx; // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
184
185 #define DPSOFTRAST_DRAW_MAXSPANS 1024
186
187 #define DPSOFTRAST_DRAW_MAXTRIANGLEPOOL 4096
188 #define DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES 64
189
190 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_Pool_s
191 {
192         int freetriangle;
193         int usedtriangles;
194         ATOMIC(DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLEPOOL]);
195 }
196 DPSOFTRAST_State_Triangle_Pool);
197
// Dirty bits stored in a thread's 'validate' field; each one marks a group of
// derived values that DPSOFTRAST_Validate must recompute.
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// everything above combined
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)

// Blend modes that DPSOFTRAST_RecalcBlendFunc derives from the blend factors.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA = 2,
	DPSOFTRAST_BLENDMODE_ADD = 3,
	DPSOFTRAST_BLENDMODE_INVMOD = 4,
	DPSOFTRAST_BLENDMODE_MUL = 5,
	DPSOFTRAST_BLENDMODE_MUL2 = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_TOTAL = 9
}
DPSOFTRAST_BLENDMODE;
217
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
219 {
220 #ifdef USE_THREADS
221         SDL_Thread *thread;
222 #endif
223         int index;
224         
225         int colormask[4];
226         int blendfunc[2];
227         int blendsubtract;
228         int depthmask;
229         int depthtest;
230         int depthfunc;
231         int scissortest;
232         int alphatest;
233         int alphafunc;
234         float alphavalue;
235         int scissor[4];
236         int viewport[4];
237         float depthrange[2];
238         float polygonoffset[2];
239
240         int shader_mode;
241         int shader_permutation;
242
243         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
244         
245         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
247
248         // DPSOFTRAST_VALIDATE_ flags
249         int validate;
250
251         // derived values (DPSOFTRAST_VALIDATE_FB)
252         int fb_colormask;
253         int fb_clearscissor[4];
254
255         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
256         int fb_depthfunc;
257
258         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
259         int fb_blendmode;
260
261         ATOMIC(int commandoffset);
262         int triangleoffset;
263
264         bool waiting;
265 #ifdef USE_THREADS
266         SDL_cond *waitcond;
267 #endif
268
269         int numspans;
270         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
271 }
272 DPSOFTRAST_State_Thread);
273
274 typedef ATOMIC(struct DPSOFTRAST_State_s
275 {
276         int fb_width;
277         int fb_height;
278         unsigned int *fb_depthpixels;
279         unsigned int *fb_colorpixels[4];
280
281         int viewport[4];
282         ALIGN(float fb_viewportcenter[4]);
283         ALIGN(float fb_viewportscale[4]);
284
285         float color[4];
286         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
287         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
288
289         int cullface;
290
291         const float *pointer_vertex3f;
292         const float *pointer_color4f;
293         const unsigned char *pointer_color4ub;
294         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
295         int stride_vertex;
296         int stride_color;
297         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
298         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
299         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
300
301         int numvertices;
302         int maxvertices;
303         float *in_array4f[DPSOFTRAST_ARRAY_TOTAL];
304         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
305         float *screencoord4f;
306
307         int shader_mode;
308         int shader_permutation;
309
310         int texture_max;
311         int texture_end;
312         int texture_firstfree;
313         DPSOFTRAST_Texture *texture;
314
315         int bigendian;
316
317         // error reporting
318         const char *errorstring;
319
320         int numthreads;
321         DPSOFTRAST_State_Thread *threads;
322 #ifdef USE_THREADS
323         SDL_mutex *trianglemutex;
324         SDL_cond *trianglecond;
325 #endif
326
327         ATOMIC(int drawtriangle);
328
329         DPSOFTRAST_State_Command_Pool commandpool;
330         DPSOFTRAST_State_Triangle_Pool trianglepool;
331 }
332 DPSOFTRAST_State);
333
334 DPSOFTRAST_State dpsoftrast;
335
336 extern int dpsoftrast_test;
337
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels (nominally 0..1) into one integer with b in bits
// 0-7, g in bits 8-15, r in bits 16-23 and a in bits 24-31, rounding each to
// the nearest of 0..255; arguments are parenthesized so expressions may be passed
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth d to the integer depth format: DPSOFTRAST_DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
343
344 void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
345 {
346         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
347         // and viewport projection values
348         int x1, x2;
349         int y1, y2;
350         x1 = thread->scissor[0];
351         x2 = thread->scissor[0] + thread->scissor[2];
352         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
353         y2 = dpsoftrast.fb_height - thread->scissor[1];
354         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
355         if (x1 < 0) x1 = 0;
356         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
357         if (y1 < 0) y1 = 0;
358         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
359         thread->fb_clearscissor[0] = x1;
360         thread->fb_clearscissor[1] = y1;
361         thread->fb_clearscissor[2] = x2 - x1;
362         thread->fb_clearscissor[3] = y2 - y1;
363 }
364
365 void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
366 {
367         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
368 }
369
370 void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
371 {
372         if (thread->blendsubtract)
373         {
374                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
375                 {
376                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
377                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
378                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
379                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
380                 }
381         }
382         else
383         {       
384                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
385                 {
386                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
387                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
388                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
389                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
390                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
391                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
392                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
393                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
394                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
395                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
396                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
397                 }
398         }
399 }
400
401 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
402
403 void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
404 {
405         mask &= thread->validate;
406         if (!mask)
407                 return;
408         if (mask & DPSOFTRAST_VALIDATE_FB)
409         {
410                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
411                 DPSOFTRAST_RecalcFB(thread);
412         }
413         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
414         {
415                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
416                 DPSOFTRAST_RecalcDepthFunc(thread);
417         }
418         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
419         {
420                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
421                 DPSOFTRAST_RecalcBlendFunc(thread);
422         }
423 }
424
425 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
426 {
427         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
428                 return &dpsoftrast.texture[index];
429         return NULL;
430 }
431
432 static void DPSOFTRAST_Texture_Grow(void)
433 {
434         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
435         DPSOFTRAST_State_Thread *thread;
436         int i;
437         int j;
438         DPSOFTRAST_Flush();
439         // expand texture array as needed
440         if (dpsoftrast.texture_max < 1024)
441                 dpsoftrast.texture_max = 1024;
442         else
443                 dpsoftrast.texture_max *= 2;
444         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
445         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
446                 if(dpsoftrast.texbound[i])
447                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
448         for (j = 0; j < dpsoftrast.numthreads; j++)
449         {
450                 thread = &dpsoftrast.threads[j];
451                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
452                         if(thread->texbound[i])
453                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
454         }
455 }
456
457 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
458 {
459         int w;
460         int h;
461         int d;
462         int size;
463         int s;
464         int texnum;
465         int mipmaps;
466         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
467         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
468         DPSOFTRAST_Texture *texture;
469         if (width*height*depth < 1)
470         {
471                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
472                 return 0;
473         }
474         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
475         {
476                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
477                 return 0;
478         }
479         switch(texformat)
480         {
481         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
482         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
483         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
484                 break;
485         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
486                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
487                 {
488                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
489                         return 0;
490                 }
491                 if (depth != 1)
492                 {
493                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
494                         return 0;
495                 }
496                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
497                 {
498                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
499                         return 0;
500                 }
501                 break;
502         }
503         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
504         {
505                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
506                 return 0;
507         }
508         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
509         {
510                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
511                 return 0;
512         }
513         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
514         {
515                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
516                 return 0;
517         }
518         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
519         {
520                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
521                 return 0;
522         }
523         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
524         {
525                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
526                 return 0;
527         }
528         // find first empty slot in texture array
529         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
530                 if (!dpsoftrast.texture[texnum].bytes)
531                         break;
532         dpsoftrast.texture_firstfree = texnum + 1;
533         if (dpsoftrast.texture_max <= texnum)
534                 DPSOFTRAST_Texture_Grow();
535         if (dpsoftrast.texture_end <= texnum)
536                 dpsoftrast.texture_end = texnum + 1;
537         texture = &dpsoftrast.texture[texnum];
538         memset(texture, 0, sizeof(*texture));
539         texture->flags = flags;
540         texture->width = width;
541         texture->height = height;
542         texture->depth = depth;
543         texture->sides = sides;
544         texture->binds = 0;
545         w = width;
546         h = height;
547         d = depth;
548         size = 0;
549         mipmaps = 0;
550         w = width;
551         h = height;
552         d = depth;
553         for (;;)
554         {
555                 s = w * h * d * sides * 4;
556                 texture->mipmap[mipmaps][0] = size;
557                 texture->mipmap[mipmaps][1] = s;
558                 texture->mipmap[mipmaps][2] = w;
559                 texture->mipmap[mipmaps][3] = h;
560                 texture->mipmap[mipmaps][4] = d;
561                 size += s;
562                 mipmaps++;
563                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
564                         break;
565                 if (w > 1) w >>= 1;
566                 if (h > 1) h >>= 1;
567                 if (d > 1) d >>= 1;
568         }
569         texture->mipmaps = mipmaps;
570         texture->size = size;
571
572         // allocate the pixels now
573         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
574
575         return texnum;
576 }
577 void DPSOFTRAST_Texture_Free(int index)
578 {
579         DPSOFTRAST_Texture *texture;
580         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
581         if (texture->binds)
582                 DPSOFTRAST_Flush();
583         if (texture->bytes)
584                 MM_FREE(texture->bytes);
585         texture->bytes = NULL;
586         memset(texture, 0, sizeof(*texture));
587         // adjust the free range and used range
588         if (dpsoftrast.texture_firstfree > index)
589                 dpsoftrast.texture_firstfree = index;
590         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
591                 dpsoftrast.texture_end--;
592 }
593 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
594 {
595         int i, x, y, z, w, layer0, layer1, row0, row1;
596         unsigned char *o, *i0, *i1, *i2, *i3;
597         DPSOFTRAST_Texture *texture;
598         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
599         if (texture->mipmaps <= 1)
600                 return;
601         for (i = 1;i < texture->mipmaps;i++)
602         {
603                 for (z = 0;z < texture->mipmap[i][4];z++)
604                 {
605                         layer0 = z*2;
606                         layer1 = z*2+1;
607                         if (layer1 >= texture->mipmap[i-1][4])
608                                 layer1 = texture->mipmap[i-1][4]-1;
609                         for (y = 0;y < texture->mipmap[i][3];y++)
610                         {
611                                 row0 = y*2;
612                                 row1 = y*2+1;
613                                 if (row1 >= texture->mipmap[i-1][3])
614                                         row1 = texture->mipmap[i-1][3]-1;
615                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
616                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
617                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
618                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
619                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
620                                 w = texture->mipmap[i][2];
621                                 if (layer1 > layer0)
622                                 {
623                                         if (texture->mipmap[i-1][2] > 1)
624                                         {
625                                                 // average 3D texture
626                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
627                                                 {
628                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
629                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
630                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
631                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
632                                                 }
633                                         }
634                                         else
635                                         {
636                                                 // average 3D mipmap with parent width == 1
637                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
638                                                 {
639                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
640                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
641                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
642                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
643                                                 }
644                                         }
645                                 }
646                                 else
647                                 {
648                                         if (texture->mipmap[i-1][2] > 1)
649                                         {
650                                                 // average 2D texture (common case)
651                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
652                                                 {
653                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
654                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
655                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
656                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
657                                                 }
658                                         }
659                                         else
660                                         {
661                                                 // 2D texture with parent width == 1
662                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
663                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
664                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
665                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
666                                         }
667                                 }
668                         }
669                 }
670         }
671 }
672 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
673 {
674         DPSOFTRAST_Texture *texture;
675         unsigned char *dst;
676         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
677         if (texture->binds)
678                 DPSOFTRAST_Flush();
679         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
680         while (blockheight > 0)
681         {
682                 memcpy(dst, pixels, blockwidth * 4);
683                 pixels += blockwidth * 4;
684                 dst += texture->mipmap[0][2] * 4;
685                 blockheight--;
686         }
687         DPSOFTRAST_Texture_CalculateMipmaps(index);
688 }
689 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
690 {
691         DPSOFTRAST_Texture *texture;
692         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
693         if (texture->binds)
694                 DPSOFTRAST_Flush();
695         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
696         DPSOFTRAST_Texture_CalculateMipmaps(index);
697 }
698 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
699 {
700         DPSOFTRAST_Texture *texture;
701         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
702         return texture->mipmap[mip][2];
703 }
704 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
705 {
706         DPSOFTRAST_Texture *texture;
707         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
708         return texture->mipmap[mip][3];
709 }
710 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
711 {
712         DPSOFTRAST_Texture *texture;
713         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
714         return texture->mipmap[mip][4];
715 }
716 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
717 {
718         DPSOFTRAST_Texture *texture;
719         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
720         if (texture->binds)
721                 DPSOFTRAST_Flush();
722         return texture->bytes + texture->mipmap[mip][0];
723 }
724 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
725 {
726         DPSOFTRAST_Texture *texture;
727         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
728         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
729         {
730                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
731                 return;
732         }
733         if (texture->binds)
734                 DPSOFTRAST_Flush();
735         texture->filter = filter;
736 }
737
738 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
739 {
740         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
741                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
742                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
743                 DPSOFTRAST_Flush();
744         dpsoftrast.fb_width = width;
745         dpsoftrast.fb_height = height;
746         dpsoftrast.fb_depthpixels = depthpixels;
747         dpsoftrast.fb_colorpixels[0] = colorpixels0;
748         dpsoftrast.fb_colorpixels[1] = colorpixels1;
749         dpsoftrast.fb_colorpixels[2] = colorpixels2;
750         dpsoftrast.fb_colorpixels[3] = colorpixels3;
751 }
752
753 void DPSOFTRAST_Draw_FlushThreads(void);
754
755 void DPSOFTRAST_Draw_FreeTrianglePool(int space)
756 {
757         DPSOFTRAST_State_Thread *thread;
758         int i;
759         int freetriangle = dpsoftrast.trianglepool.freetriangle;
760         int usedtriangles = dpsoftrast.trianglepool.usedtriangles;
761         if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space)
762                 return;
763 #ifdef USE_THREADS
764         SDL_LockMutex(dpsoftrast.trianglemutex);
765 #endif
766         for(;;)
767         {
768                 int waitindex = -1;
769                 int triangleoffset;
770                 usedtriangles = 0;
771                 for (i = 0; i < dpsoftrast.numthreads; i++)
772                 {
773                         thread = &dpsoftrast.threads[i];
774                         triangleoffset = freetriangle - thread->triangleoffset;
775                         if (triangleoffset < 0)
776                                 triangleoffset += DPSOFTRAST_DRAW_MAXTRIANGLEPOOL;
777                         if (triangleoffset > usedtriangles)
778                         {
779                                 waitindex = i;
780                                 usedtriangles = triangleoffset;
781                         }
782                 }
783                 if (usedtriangles <= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-space || waitindex < 0)
784                         break;
785 #ifdef USE_THREADS
786                 thread = &dpsoftrast.threads[waitindex];
787                 thread->waiting = true;
788                 SDL_CondBroadcast(dpsoftrast.trianglecond);
789                 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
790                 thread->waiting = false;
791 #endif
792         }
793 #ifdef USE_THREADS
794         SDL_UnlockMutex(dpsoftrast.trianglemutex);
795 #endif
796         dpsoftrast.trianglepool.usedtriangles = usedtriangles;
797 }
798
799 void DPSOFTRAST_Draw_SyncCommands(void)
800 {
801         DPSOFTRAST_State_Triangle *triangle;
802         if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
803 #ifdef USE_THREADS
804                 DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
805 #else
806                 DPSOFTRAST_Draw_FlushThreads();
807 #endif
808         triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
809         triangle->commandoffset = dpsoftrast.commandpool.freecommand;
810         triangle->starty = -1;
811         triangle->endy = -1;
812         dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
813         dpsoftrast.trianglepool.usedtriangles++;
814         MEMORY_BARRIER;
815         dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
816 }
817
818 void DPSOFTRAST_Draw_FreeCommandPool(int space)
819 {
820         DPSOFTRAST_State_Thread *thread;
821         int i;
822         int freecommand = dpsoftrast.commandpool.freecommand;
823         int usedcommands = dpsoftrast.commandpool.usedcommands;
824         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
825                 return;
826         DPSOFTRAST_Draw_SyncCommands();
827 #ifdef USE_THREADS
828         SDL_LockMutex(dpsoftrast.trianglemutex);
829 #endif
830         for(;;)
831         {
832                 int waitindex = -1;
833                 int commandoffset;
834                 usedcommands = 0;
835                 for (i = 0; i < dpsoftrast.numthreads; i++)
836                 {
837                         thread = &dpsoftrast.threads[i]; 
838                         commandoffset = freecommand - thread->commandoffset;
839                         if (commandoffset < 0)
840                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
841                         if (commandoffset > usedcommands)
842                         {
843                                 waitindex = i;
844                                 usedcommands = commandoffset;
845                         }
846                 }
847                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
848                         break;
849 #ifdef USE_THREADS
850                 thread = &dpsoftrast.threads[waitindex];
851                 thread->waiting = true;
852                 SDL_CondBroadcast(dpsoftrast.trianglecond);
853                 SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
854                 thread->waiting = false;
855 #endif
856         }
857 #ifdef USE_THREADS
858         SDL_UnlockMutex(dpsoftrast.trianglemutex);
859 #endif
860         dpsoftrast.commandpool.usedcommands = usedcommands;
861 }
862
// Allocates a DPSOFTRAST_Command_<name> from the command pool.  The requested
// size is sizeof the command padded up to the next multiple of COMMAND_SIZE
// (the masking only rounds correctly if COMMAND_SIZE is a power of two).
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		sizeof( DPSOFTRAST_Command_##name ) + \
		((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1))))
865
866 static void *DPSOFTRAST_AllocateCommand(int size)
867 {
868         DPSOFTRAST_Command *command;
869         int freecommand = dpsoftrast.commandpool.freecommand;
870         int usedcommands = dpsoftrast.commandpool.usedcommands;
871         int extra = sizeof(DPSOFTRAST_Command);
872         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
873                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
874         if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
875         {
876 #ifdef USE_THREADS
877                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
878 #else
879                 DPSOFTRAST_Draw_FlushThreads();
880 #endif
881                 freecommand = dpsoftrast.commandpool.freecommand;
882                 usedcommands = dpsoftrast.commandpool.usedcommands;
883         }
884         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
885         {
886                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
887                 command->opcode = DPSOFTRAST_OPCODE_Reset;
888                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
889                 freecommand = 0;
890         }
891         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
892         freecommand += size;
893         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
894                 freecommand = 0;
895
896         dpsoftrast.commandpool.freecommand = freecommand;
897         dpsoftrast.commandpool.usedcommands = usedcommands + size;
898         return command;
899 }
900         
901 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
902 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
903 {
904         thread->viewport[0] = command->x;
905         thread->viewport[1] = command->y;
906         thread->viewport[2] = command->width;
907         thread->viewport[3] = command->height;
908         thread->validate |= DPSOFTRAST_VALIDATE_FB;
909 }
910 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
911 {
912         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
913         command->opcode = DPSOFTRAST_OPCODE_Viewport;
914         command->x = x;
915         command->y = y;
916         command->width = width;
917         command->height = height;
918
919         dpsoftrast.viewport[0] = x;
920         dpsoftrast.viewport[1] = y;
921         dpsoftrast.viewport[2] = width;
922         dpsoftrast.viewport[3] = height;
923         dpsoftrast.fb_viewportcenter[1] = dpsoftrast.viewport[0] + 0.5f * dpsoftrast.viewport[2] - 0.5f;
924         dpsoftrast.fb_viewportcenter[2] = dpsoftrast.fb_height - dpsoftrast.viewport[1] - 0.5f * dpsoftrast.viewport[3] - 0.5f;
925         dpsoftrast.fb_viewportcenter[3] = 0.5f;
926         dpsoftrast.fb_viewportcenter[0] = 0.0f;
927         dpsoftrast.fb_viewportscale[1] = 0.5f * dpsoftrast.viewport[2];
928         dpsoftrast.fb_viewportscale[2] = -0.5f * dpsoftrast.viewport[3];
929         dpsoftrast.fb_viewportscale[3] = 0.5f;
930         dpsoftrast.fb_viewportscale[0] = 1.0f;
931 }
932
933 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
934 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
935 {
936         int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
937         unsigned int *p;
938         unsigned int c;
939         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
940         x1 = thread->fb_clearscissor[0];
941         y1 = thread->fb_clearscissor[1];
942         x2 = thread->fb_clearscissor[2];
943         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
944         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
945         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
946         if(y1 < t1) y1 = t1;
947         if(y2 > t2) y2 = t2;
948         w = x2 - x1;
949         h = y2 - y1;
950         if (w < 1 || h < 1)
951                 return;
952         // FIXME: honor fb_colormask?
953         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
954         for (i = 0;i < 4;i++)
955         {
956                 if (!dpsoftrast.fb_colorpixels[i])
957                         continue;
958                 for (y = y1;y < y2;y++)
959                 {
960                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
961                         for (x = x1;x < x2;x++)
962                                 p[x] = c;
963                 }
964         }
965 }
966 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
967 {
968         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
969         command->opcode = DPSOFTRAST_OPCODE_ClearColor;
970         command->r = r;
971         command->g = g;
972         command->b = b;
973         command->a = a;
974 }
975
976 DEFCOMMAND(3, ClearDepth, float depth;)
977 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
978 {
979         int x1, y1, x2, y2, w, h, x, y, t1, t2;
980         unsigned int *p;
981         unsigned int c;
982         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
983         x1 = thread->fb_clearscissor[0];
984         y1 = thread->fb_clearscissor[1];
985         x2 = thread->fb_clearscissor[2];
986         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
987         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
988         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
989         if(y1 < t1) y1 = t1;
990         if(y2 > t2) y2 = t2;
991         w = x2 - x1;
992         h = y2 - y1;
993         if (w < 1 || h < 1)
994                 return;
995         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
996         for (y = y1;y < y2;y++)
997         {
998                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
999                 for (x = x1;x < x2;x++)
1000                         p[x] = c;
1001         }
1002 }
1003 void DPSOFTRAST_ClearDepth(float d)
1004 {
1005         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1006         command->opcode = DPSOFTRAST_OPCODE_ClearDepth;
1007         command->depth = d;
1008 }
1009
1010 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1011 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1012 {
1013         thread->colormask[0] = command->r != 0;
1014         thread->colormask[1] = command->g != 0;
1015         thread->colormask[2] = command->b != 0;
1016         thread->colormask[3] = command->a != 0;
1017         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1018 }
1019 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1020 {
1021         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1022         command->opcode = DPSOFTRAST_OPCODE_ColorMask;
1023         command->r = r;
1024         command->g = g;
1025         command->b = b;
1026         command->a = a;
1027 }
1028
1029 DEFCOMMAND(5, DepthTest, int enable;)
1030 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1031 {
1032         thread->depthtest = command->enable;
1033         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1034 }
1035 void DPSOFTRAST_DepthTest(int enable)
1036 {
1037         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1038         command->opcode = DPSOFTRAST_OPCODE_DepthTest;
1039         command->enable = enable;
1040 }
1041
1042 DEFCOMMAND(6, ScissorTest, int enable;)
1043 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1044 {
1045         thread->scissortest = command->enable;
1046         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1047 }
1048 void DPSOFTRAST_ScissorTest(int enable)
1049 {
1050         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1051         command->opcode = DPSOFTRAST_OPCODE_ScissorTest;
1052         command->enable = enable;
1053 }
1054
1055 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1056 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1057 {
1058         thread->scissor[0] = command->x;
1059         thread->scissor[1] = command->y;
1060         thread->scissor[2] = command->width;
1061         thread->scissor[3] = command->height;
1062         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1063 }
1064 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1065 {
1066         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1067         command->opcode = DPSOFTRAST_OPCODE_Scissor;
1068         command->x = x;
1069         command->y = y;
1070         command->width = width;
1071         command->height = height;
1072 }
1073
1074 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1075 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1076 {
1077         thread->blendfunc[0] = command->sfactor;
1078         thread->blendfunc[1] = command->dfactor;
1079         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1080 }
1081 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1082 {
1083         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1084         command->opcode = DPSOFTRAST_OPCODE_BlendFunc;
1085         command->sfactor = sfactor;
1086         command->dfactor = dfactor;
1087 }
1088
1089 DEFCOMMAND(9, BlendSubtract, int enable;)
1090 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1091 {
1092         thread->blendsubtract = command->enable;
1093         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1094 }
1095 void DPSOFTRAST_BlendSubtract(int enable)
1096 {
1097         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1098         command->opcode = DPSOFTRAST_OPCODE_BlendSubtract;
1099         command->enable = enable;
1100 }
1101
1102 DEFCOMMAND(10, DepthMask, int enable;)
1103 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1104 {
1105         thread->depthmask = command->enable;
1106 }
1107 void DPSOFTRAST_DepthMask(int enable)
1108 {
1109         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1110         command->opcode = DPSOFTRAST_OPCODE_DepthMask;
1111         command->enable = enable;
1112 }
1113
1114 DEFCOMMAND(11, DepthFunc, int func;)
1115 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1116 {
1117         thread->depthfunc = command->func;
1118 }
1119 void DPSOFTRAST_DepthFunc(int func)
1120 {
1121         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1122         command->opcode = DPSOFTRAST_OPCODE_DepthFunc;
1123         command->func = func;
1124 }
1125
1126 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1127 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1128 {
1129         thread->depthrange[0] = command->nearval;
1130         thread->depthrange[1] = command->farval;
1131 }
1132 void DPSOFTRAST_DepthRange(float nearval, float farval)
1133 {
1134         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1135         command->opcode = DPSOFTRAST_OPCODE_DepthRange;
1136         command->nearval = nearval;
1137         command->farval = farval;
1138 }
1139
1140 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1141 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1142 {
1143         thread->polygonoffset[0] = command->alongnormal;
1144         thread->polygonoffset[1] = command->intoview;
1145 }
1146 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1147 {
1148         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1149         command->opcode = DPSOFTRAST_OPCODE_PolygonOffset;
1150         command->alongnormal = alongnormal;
1151         command->intoview = intoview;
1152 }
1153
1154 void DPSOFTRAST_CullFace(int mode)
1155 {
1156         dpsoftrast.cullface = mode;
1157 }
1158
1159 DEFCOMMAND(15, AlphaTest, int enable;)
1160 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1161 {
1162         thread->alphatest = command->enable;
1163 }
1164 void DPSOFTRAST_AlphaTest(int enable)
1165 {
1166         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1167         command->opcode = DPSOFTRAST_OPCODE_AlphaTest;
1168         command->enable = enable;
1169 }
1170
1171 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1172 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1173 {
1174         thread->alphafunc = command->func;
1175         thread->alphavalue = command->ref;
1176 }
1177 void DPSOFTRAST_AlphaFunc(int func, float ref)
1178 {
1179         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1180         command->opcode = DPSOFTRAST_OPCODE_AlphaFunc;
1181         command->func = func;
1182         command->ref = ref;
1183 }
1184
1185 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1186 {
1187         dpsoftrast.color[0] = r;
1188         dpsoftrast.color[1] = g;
1189         dpsoftrast.color[2] = b;
1190         dpsoftrast.color[3] = a;
1191 }
1192
1193 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1194 {
1195         int outstride = blockwidth * 4;
1196         int instride = dpsoftrast.fb_width * 4;
1197         int bx1 = blockx;
1198         int by1 = blocky;
1199         int bx2 = blockx + blockwidth;
1200         int by2 = blocky + blockheight;
1201         int bw;
1202         int bh;
1203         int x;
1204         int y;
1205         unsigned char *inpixels;
1206         unsigned char *b;
1207         unsigned char *o;
1208         DPSOFTRAST_Flush();
1209         if (bx1 < 0) bx1 = 0;
1210         if (by1 < 0) by1 = 0;
1211         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1212         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1213         bw = bx2 - bx1;
1214         bh = by2 - by1;
1215         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1216         if (dpsoftrast.bigendian)
1217         {
1218                 for (y = by1;y < by2;y++)
1219                 {
1220                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1221                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1222                         for (x = bx1;x < bx2;x++)
1223                         {
1224                                 o[0] = b[3];
1225                                 o[1] = b[2];
1226                                 o[2] = b[1];
1227                                 o[3] = b[0];
1228                                 o += 4;
1229                                 b += 4;
1230                         }
1231                 }
1232         }
1233         else
1234         {
1235                 for (y = by1;y < by2;y++)
1236                 {
1237                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1238                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1239                         memcpy(o, b, bw*4);
1240                 }
1241         }
1242
1243 }
1244 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1245 {
1246         int tx1 = tx;
1247         int ty1 = ty;
1248         int tx2 = tx + width;
1249         int ty2 = ty + height;
1250         int sx1 = sx;
1251         int sy1 = sy;
1252         int sx2 = sx + width;
1253         int sy2 = sy + height;
1254         int swidth;
1255         int sheight;
1256         int twidth;
1257         int theight;
1258         int sw;
1259         int sh;
1260         int tw;
1261         int th;
1262         int y;
1263         unsigned int *spixels;
1264         unsigned int *tpixels;
1265         DPSOFTRAST_Texture *texture;
1266         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1267         if (mip < 0 || mip >= texture->mipmaps) return;
1268         if (texture->binds)
1269                 DPSOFTRAST_Flush();
1270         spixels = dpsoftrast.fb_colorpixels[0];
1271         swidth = dpsoftrast.fb_width;
1272         sheight = dpsoftrast.fb_height;
1273         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1274         twidth = texture->mipmap[mip][2];
1275         theight = texture->mipmap[mip][3];
1276         if (tx1 < 0) tx1 = 0;
1277         if (ty1 < 0) ty1 = 0;
1278         if (tx2 > twidth) tx2 = twidth;
1279         if (ty2 > theight) ty2 = theight;
1280         if (sx1 < 0) sx1 = 0;
1281         if (sy1 < 0) sy1 = 0;
1282         if (sx2 > swidth) sx2 = swidth;
1283         if (sy2 > sheight) sy2 = sheight;
1284         tw = tx2 - tx1;
1285         th = ty2 - ty1;
1286         sw = sx2 - sx1;
1287         sh = sy2 - sy1;
1288         if (tw > sw) tw = sw;
1289         if (th > sh) th = sh;
1290         if (tw < 1 || th < 1)
1291                 return;
1292         for (y = 0;y < th;y++)
1293                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1294         if (texture->mipmaps > 1)
1295                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1296 }
1297
1298 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1299 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1300 {
1301         if (thread->texbound[command->unitnum])
1302                 ATOMIC_ADD(thread->texbound[command->unitnum]->binds, -1);
1303         thread->texbound[command->unitnum] = command->texture;
1304 }
1305 void DPSOFTRAST_SetTexture(int unitnum, int index)
1306 {
1307         DPSOFTRAST_Command_SetTexture *command;
1308         DPSOFTRAST_Texture *texture;
1309         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1310         {
1311                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1312                 return;
1313         }
1314         texture = DPSOFTRAST_Texture_GetByIndex(index);
1315         if (index && !texture)
1316         {
1317                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1318                 return;
1319         }
1320
1321         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1322         command->opcode = DPSOFTRAST_OPCODE_SetTexture;
1323         command->unitnum = unitnum;
1324         command->texture = texture;
1325
1326         dpsoftrast.texbound[unitnum] = texture;
1327         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1328 }
1329
1330 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1331 {
1332         dpsoftrast.pointer_vertex3f = vertex3f;
1333         dpsoftrast.stride_vertex = stride;
1334 }
1335 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1336 {
1337         dpsoftrast.pointer_color4f = color4f;
1338         dpsoftrast.pointer_color4ub = NULL;
1339         dpsoftrast.stride_color = stride;
1340 }
1341 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1342 {
1343         dpsoftrast.pointer_color4f = NULL;
1344         dpsoftrast.pointer_color4ub = color4ub;
1345         dpsoftrast.stride_color = stride;
1346 }
1347 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1348 {
1349         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1350         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1351         dpsoftrast.stride_texcoord[unitnum] = stride;
1352 }
1353
1354 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1355 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1356 {
1357         thread->shader_mode = command->mode;
1358         thread->shader_permutation = command->permutation;
1359 }
1360 void DPSOFTRAST_SetShader(int mode, int permutation)
1361 {
1362         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1363         command->opcode = DPSOFTRAST_OPCODE_SetShader;
1364         command->mode = mode;
1365         command->permutation = permutation;
1366
1367         dpsoftrast.shader_mode = mode;
1368         dpsoftrast.shader_permutation = permutation;
1369 }
1370
1371 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1372 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1373 {
1374         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1375 }
1376 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1377 {
1378         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1379         command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1380         command->index = index;
1381         command->val[0] = v0;
1382         command->val[1] = v1;
1383         command->val[2] = v2;
1384         command->val[3] = v3;
1385
1386         dpsoftrast.uniform4f[index*4+0] = v0;
1387         dpsoftrast.uniform4f[index*4+1] = v1;
1388         dpsoftrast.uniform4f[index*4+2] = v2;
1389         dpsoftrast.uniform4f[index*4+3] = v3;
1390 }
1391 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1392 {
1393         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1394         command->opcode = DPSOFTRAST_OPCODE_Uniform4f;
1395         command->index = index;
1396         memcpy(command->val, v, sizeof(command->val));
1397
1398         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1399 }
1400
1401 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1402 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1403 {
1404         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1405 }
1406 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1407 {
1408 #ifdef SSE2_PRESENT
1409         int i, index;
1410         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1411         {
1412                 __m128 m0, m1, m2, m3;
1413                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1414                 command->opcode = DPSOFTRAST_OPCODE_UniformMatrix4f;
1415                 command->index = index;
1416                 if (((size_t)v)&(ALIGN_SIZE-1))
1417                 {
1418                         m0 = _mm_loadu_ps(v);
1419                         m1 = _mm_loadu_ps(v+4);
1420                         m2 = _mm_loadu_ps(v+8);
1421                         m3 = _mm_loadu_ps(v+12);
1422                 }
1423                 else
1424                 {
1425                         m0 = _mm_load_ps(v);
1426                         m1 = _mm_load_ps(v+4);
1427                         m2 = _mm_load_ps(v+8);
1428                         m3 = _mm_load_ps(v+12);
1429                 }
1430                 if (transpose)
1431                 {
1432                         __m128 t0, t1, t2, t3;
1433                         t0 = _mm_unpacklo_ps(m0, m1);
1434                         t1 = _mm_unpacklo_ps(m2, m3);
1435                         t2 = _mm_unpackhi_ps(m0, m1);
1436                         t3 = _mm_unpackhi_ps(m2, m3);
1437                         m0 = _mm_movelh_ps(t0, t1);
1438                         m1 = _mm_movehl_ps(t1, t0);
1439                         m2 = _mm_movelh_ps(t2, t3);
1440                         m3 = _mm_movehl_ps(t3, t2);                     
1441                 }
1442                 _mm_store_ps(command->val, m0);
1443                 _mm_store_ps(command->val+4, m1);
1444                 _mm_store_ps(command->val+8, m2);
1445                 _mm_store_ps(command->val+12, m3);
1446                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1447                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1448                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1449                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1450         }
1451 #endif
1452 }
1453
1454 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1455 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1456 {
1457         thread->uniform1i[command->index] = command->val;
1458 }
1459 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1460 {
1461         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1462         command->opcode = DPSOFTRAST_OPCODE_Uniform1i;
1463         command->index = index;
1464         command->val = i0;
1465
1466         dpsoftrast.uniform1i[command->index] = i0;
1467 }
1468
1469 #ifdef SSE2_PRESENT
1470 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1471 {
1472         float *end = dst + size*4;
1473         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1474         {
1475                 while (dst < end)
1476                 {
1477                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1478                         dst += 4;
1479                         src += stride;
1480                 }
1481         }
1482         else
1483         {
1484                 while (dst < end)
1485                 {
1486                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1487                         dst += 4;
1488                         src += stride;
1489                 }
1490         }
1491 }
1492
1493 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1494 {
1495         float *end = dst + size*4;
1496         if (stride == sizeof(float[3]))
1497         {
1498                 float *end4 = dst + (size&~3)*4;        
1499                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1500                 {
1501                         while (dst < end4)
1502                         {
1503                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1504                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1505                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1506                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1507                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1508                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1509                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1510                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1511                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1512                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1513                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1514                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1515                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1516                                 dst += 16;
1517                                 src += 4*sizeof(float[3]);
1518                         }
1519                 }
1520                 else
1521                 {
1522                         while (dst < end4)
1523                         {
1524                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1525                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1526                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1527                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1528                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1529                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1530                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1532                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1533                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1534                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1535                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1536                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537                                 dst += 16;
1538                                 src += 4*sizeof(float[3]);
1539                         }
1540                 }
1541         }
1542         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1543         {
1544                 while (dst < end)
1545                 {
1546                         __m128 v = _mm_loadu_ps((const float *)src);
1547                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1548                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1549                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1550                         _mm_store_ps(dst, v);
1551                         dst += 4;
1552                         src += stride;
1553                 }
1554         }
1555         else
1556         {
1557                 while (dst < end)
1558                 {
1559                         __m128 v = _mm_load_ps((const float *)src);
1560                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1561                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1562                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1563                         _mm_store_ps(dst, v);
1564                         dst += 4;
1565                         src += stride;
1566                 }
1567         }
1568 }
1569
1570 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1571 {
1572         float *end = dst + size*4;
1573         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1574         if (stride == sizeof(float[2]))
1575         {
1576                 float *end2 = dst + (size&~1)*4;
1577                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1578                 {
1579                         while (dst < end2)
1580                         {
1581                                 __m128 v = _mm_loadu_ps((const float *)src);
1582                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1583                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1584                                 dst += 8;
1585                                 src += 2*sizeof(float[2]);
1586                         }
1587                 }
1588                 else
1589                 {
1590                         while (dst < end2)
1591                         {
1592                                 __m128 v = _mm_load_ps((const float *)src);
1593                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1594                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1595                                 dst += 8;
1596                                 src += 2*sizeof(float[2]);
1597                         }
1598                 }
1599         }
1600         while (dst < end)
1601         {
1602                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1603                 dst += 4;
1604                 src += stride;
1605         }
1606 }
1607
1608 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1609 {
1610         float *end = dst + size*4;
1611         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1612         if (stride == sizeof(unsigned char[4]))
1613         {
1614                 float *end4 = dst + (size&~3)*4;
1615                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1616                 {
1617                         while (dst < end4)
1618                         {
1619                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1620                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1621                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1622                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1623                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1624                                 dst += 16;
1625                                 src += 4*sizeof(unsigned char[4]);
1626                         }
1627                 }
1628                 else
1629                 {
1630                         while (dst < end4)
1631                         {
1632                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1633                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1634                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1635                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1636                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1637                                 dst += 16;
1638                                 src += 4*sizeof(unsigned char[4]);
1639                         }
1640                 }
1641         }
1642         while (dst < end)
1643         {
1644                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1645                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1646                 dst += 4;
1647                 src += stride;
1648         }
1649 }
1650
// writes `size` copies of the 4-float value at src to the contiguous array dst
// src is read once with an unaligned load; dst is written with aligned stores
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0;i < size;i++)
		_mm_store_ps(dst + i*4, value);
}
1661 #endif
1662
1663 void DPSOFTRAST_Draw_LoadVertices(int firstvertex, int numvertices, bool needcolors)
1664 {
1665 #ifdef SSE2_PRESENT
1666         int i;
1667         int j;
1668         int stride;
1669         const float *v;
1670         float *p;
1671         float *data;
1672         const unsigned char *b;
1673         dpsoftrast.numvertices = numvertices;
1674         if (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1675         {
1676                 if (dpsoftrast.maxvertices < 4096)
1677                         dpsoftrast.maxvertices = 4096;
1678                 while (dpsoftrast.maxvertices < dpsoftrast.numvertices)
1679                         dpsoftrast.maxvertices *= 2;
1680                 if (dpsoftrast.in_array4f[0])
1681                         MM_FREE(dpsoftrast.in_array4f[0]);
1682                 data = (float *)MM_CALLOC(1, dpsoftrast.maxvertices * sizeof(float[4])*(DPSOFTRAST_ARRAY_TOTAL*2 + 1));
1683                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1684                         dpsoftrast.in_array4f[i] = data;
1685                 for (i = 0;i < DPSOFTRAST_ARRAY_TOTAL;i++, data += dpsoftrast.maxvertices * 4)
1686                         dpsoftrast.post_array4f[i] = data;
1687                 dpsoftrast.screencoord4f = data;
1688                 data += dpsoftrast.maxvertices * 4;
1689         }
1690         stride = dpsoftrast.stride_vertex;
1691         v = (const float *)((unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride);
1692         p = dpsoftrast.in_array4f[0];
1693         DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1694         if (needcolors)
1695         {
1696                 if (dpsoftrast.pointer_color4f)
1697                 {
1698                         stride = dpsoftrast.stride_color;
1699                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride);
1700                         p = dpsoftrast.in_array4f[1];
1701                         DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1702                 }
1703                 else if (dpsoftrast.pointer_color4ub)
1704                 {
1705                         stride = dpsoftrast.stride_color;
1706                         b = (const unsigned char *)((const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride);
1707                         p = dpsoftrast.in_array4f[1];
1708                         DPSOFTRAST_Load4bTo4f(p, b, numvertices, stride);
1709                 }
1710                 else
1711                 {
1712                         p = dpsoftrast.in_array4f[1];
1713                         DPSOFTRAST_Fill4f(p, dpsoftrast.color, numvertices);
1714                 }
1715         }
1716         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL-2;j++)
1717         {
1718                 if (dpsoftrast.pointer_texcoordf[j])
1719                 {
1720                         stride = dpsoftrast.stride_texcoord[j];
1721                         v = (const float *)((const unsigned char *)dpsoftrast.pointer_texcoordf[j] + firstvertex * stride);
1722                         p = dpsoftrast.in_array4f[j+2];
1723                         switch(dpsoftrast.components_texcoord[j])
1724                         {
1725                         case 2:
1726                                 DPSOFTRAST_Load2fTo4f(p, (const unsigned char *)v, numvertices, stride);
1727                                 break;
1728                         case 3:
1729                                 DPSOFTRAST_Load3fTo4f(p, (const unsigned char *)v, numvertices, stride);
1730                                 break;
1731                         case 4:
1732                                 DPSOFTRAST_Load4fTo4f(p, (const unsigned char *)v, numvertices, stride);
1733                                 break;
1734                         }
1735                 }
1736         }
1737 #endif
1738 }
1739
// transforms numitems 4-float vectors from in4f by the 16-float matrix inmatrix16f
// and writes the results to out4f
// each output is x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
// out4f is written with aligned stores; in4f and inmatrix16f may be unaligned
void DPSOFTRAST_Array_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	// mx..mw are the matrix vectors that get scaled by the input x, y, z and w
	__m128 mx, my, mz, mw;
	float *const stop = out4f + numitems*4;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			// summed innermost-first: x + (y + (z + w))
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1786
// copies numitems 4-float elements from in4f to out4f using memcpy
void DPSOFTRAST_Array_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1791
1792 #ifdef SSE2_PRESENT
1793 static __m128 DPSOFTRAST_Draw_ProjectVertex(__m128 v)
1794 {
1795         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1796         __m128 w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
1797         v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
1798         v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
1799         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1800         return v;
1801 }
1802 #endif
1803
// copies numitems 4-float vertices from in4f to out4f unchanged and writes the
// viewport-projected form of each one to screen4f
// the projection works on the rotated vector (1, x, y, z): each lane becomes
// fb_viewportcenter + fb_viewportscale * lane / w, then the lanes are rotated back
// out4f and screen4f are written with aligned stores; in4f may be unaligned, which
// matters because DPSOFTRAST_Array_TransformProject forwards its in4f here when
// its matrix is identity
void DPSOFTRAST_Array_Project(float *out4f, float *screen4f, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			_mm_store_ps(out4f, v);
			v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
			_mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f), w = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
			_mm_store_ps(out4f, v);
			v = _mm_move_ss(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			v = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, v), w));
			_mm_store_ps(screen4f, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
#endif
}
1822
// transforms numitems 4-float vertices from in4f by the 16-float matrix
// inmatrix16f, writes the transformed vertices to out4f and their
// viewport-projected form to screen4f
// with an identity matrix the work is handed to DPSOFTRAST_Array_Project
// out4f and screen4f are written with aligned stores; in4f and inmatrix16f may be
// unaligned
void DPSOFTRAST_Array_TransformProject(float *out4f, float *screen4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	// mx..mw are the matrix vectors that get scaled by the input x, y, z and w
	__m128 mx, my, mz, mw, center, scale;
	float *const stop = out4f + numitems*4;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		DPSOFTRAST_Array_Project(out4f, screen4f, in4f, numitems);
		return;
	}
	center = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	scale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (;out4f < stop;in4f += 4, out4f += 4, screen4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			// summed innermost-first: x + (y + (z + w))
			const __m128 t = _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm)));
			const __m128 wwww = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 s;
			_mm_store_ps(out4f, t);
			// project on the rotated vector (1, x, y, z), then rotate back
			s = _mm_move_ss(_mm_shuffle_ps(t, t, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			s = _mm_add_ps(center, _mm_div_ps(_mm_mul_ps(scale, s), wwww));
			_mm_store_ps(screen4f, _mm_shuffle_ps(s, s, _MM_SHUFFLE(0, 3, 2, 1)));
		}
	}
	else
	{
		for (;out4f < stop;in4f += 4, out4f += 4, screen4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			const __m128 t = _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm)));
			const __m128 wwww = _mm_shuffle_ps(t, t, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 s;
			_mm_store_ps(out4f, t);
			s = _mm_move_ss(_mm_shuffle_ps(t, t, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f));
			s = _mm_add_ps(center, _mm_div_ps(_mm_mul_ps(scale, s), wwww));
			_mm_store_ps(screen4f, _mm_shuffle_ps(s, s, _MM_SHUFFLE(0, 3, 2, 1)));
		}
	}
#endif
}
1880
1881 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1882 {
1883         int x;
1884         int startx = span->startx;
1885         int endx = span->endx;
1886         float wslope = triangle->w[0];
1887         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1888         float endz = 1.0f / (w + wslope * startx);
1889         for (x = startx;x < endx;)
1890         {
1891                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1892                 float z = endz, dz;
1893                 if(nextsub >= endx) nextsub = endsub = endx-1;
1894                 endz = 1.0f / (w + wslope * nextsub);
1895                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1896                 for (; x <= endsub; x++, z += dz)
1897                         zf[x] = z;
1898         }
1899 }
1900
1901 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1902 {
1903         int x;
1904         int startx = span->startx;
1905         int endx = span->endx;
1906         int d[4];
1907         float a, b;
1908         unsigned char * RESTRICT pixelmask = span->pixelmask;
1909         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1910         if (!pixel)
1911                 return;
1912         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1913         // handle alphatest now (this affects depth writes too)
1914         if (thread->alphatest)
1915                 for (x = startx;x < endx;x++)
1916                         if (in4f[x*4+3] < 0.5f)
1917                                 pixelmask[x] = false;
1918         // FIXME: this does not handle bigendian
1919         switch(thread->fb_blendmode)
1920         {
1921         case DPSOFTRAST_BLENDMODE_OPAQUE:
1922                 for (x = startx;x < endx;x++)
1923                 {
1924                         if (!pixelmask[x])
1925                                 continue;
1926                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1927                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1928                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1929                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1930                         pixel[x*4+0] = d[0];
1931                         pixel[x*4+1] = d[1];
1932                         pixel[x*4+2] = d[2];
1933                         pixel[x*4+3] = d[3];
1934                 }
1935                 break;
1936         case DPSOFTRAST_BLENDMODE_ALPHA:
1937                 for (x = startx;x < endx;x++)
1938                 {
1939                         if (!pixelmask[x])
1940                                 continue;
1941                         a = in4f[x*4+3] * 255.0f;
1942                         b = 1.0f - in4f[x*4+3];
1943                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1944                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1945                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1946                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1947                         pixel[x*4+0] = d[0];
1948                         pixel[x*4+1] = d[1];
1949                         pixel[x*4+2] = d[2];
1950                         pixel[x*4+3] = d[3];
1951                 }
1952                 break;
1953         case DPSOFTRAST_BLENDMODE_ADDALPHA:
1954                 for (x = startx;x < endx;x++)
1955                 {
1956                         if (!pixelmask[x])
1957                                 continue;
1958                         a = in4f[x*4+3] * 255.0f;
1959                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1960                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1961                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1962                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1963                         pixel[x*4+0] = d[0];
1964                         pixel[x*4+1] = d[1];
1965                         pixel[x*4+2] = d[2];
1966                         pixel[x*4+3] = d[3];
1967                 }
1968                 break;
1969         case DPSOFTRAST_BLENDMODE_ADD:
1970                 for (x = startx;x < endx;x++)
1971                 {
1972                         if (!pixelmask[x])
1973                                 continue;
1974                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1975                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1976                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1977                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1978                         pixel[x*4+0] = d[0];
1979                         pixel[x*4+1] = d[1];
1980                         pixel[x*4+2] = d[2];
1981                         pixel[x*4+3] = d[3];
1982                 }
1983                 break;
1984         case DPSOFTRAST_BLENDMODE_INVMOD:
1985                 for (x = startx;x < endx;x++)
1986                 {
1987                         if (!pixelmask[x])
1988                                 continue;
1989                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
1990                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
1991                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
1992                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
1993                         pixel[x*4+0] = d[0];
1994                         pixel[x*4+1] = d[1];
1995                         pixel[x*4+2] = d[2];
1996                         pixel[x*4+3] = d[3];
1997                 }
1998                 break;
1999         case DPSOFTRAST_BLENDMODE_MUL:
2000                 for (x = startx;x < endx;x++)
2001                 {
2002                         if (!pixelmask[x])
2003                                 continue;
2004                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2005                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2006                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2007                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2008                         pixel[x*4+0] = d[0];
2009                         pixel[x*4+1] = d[1];
2010                         pixel[x*4+2] = d[2];
2011                         pixel[x*4+3] = d[3];
2012                 }
2013                 break;
2014         case DPSOFTRAST_BLENDMODE_MUL2:
2015                 for (x = startx;x < endx;x++)
2016                 {
2017                         if (!pixelmask[x])
2018                                 continue;
2019                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2020                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2021                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2022                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2023                         pixel[x*4+0] = d[0];
2024                         pixel[x*4+1] = d[1];
2025                         pixel[x*4+2] = d[2];
2026                         pixel[x*4+3] = d[3];
2027                 }
2028                 break;
2029         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2030                 for (x = startx;x < endx;x++)
2031                 {
2032                         if (!pixelmask[x])
2033                                 continue;
2034                         a = in4f[x*4+3] * -255.0f;
2035                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2036                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2037                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2038                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2039                         pixel[x*4+0] = d[0];
2040                         pixel[x*4+1] = d[1];
2041                         pixel[x*4+2] = d[2];
2042                         pixel[x*4+3] = d[3];
2043                 }
2044                 break;
2045         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2046                 for (x = startx;x < endx;x++)
2047                 {
2048                         if (!pixelmask[x])
2049                                 continue;
2050                         a = 255.0f;
2051                         b = 1.0f - in4f[x*4+3];
2052                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2053                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2054                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2055                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2056                         pixel[x*4+0] = d[0];
2057                         pixel[x*4+1] = d[1];
2058                         pixel[x*4+2] = d[2];
2059                         pixel[x*4+3] = d[3];
2060                 }
2061                 break;
2062         }
2063 }
2064
2065 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2066 {
2067 #ifdef SSE2_PRESENT
2068         int x;
2069         int startx = span->startx;
2070         int endx = span->endx;
2071         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2072         unsigned char * RESTRICT pixelmask = span->pixelmask;
2073         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2074         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2075         if (!pixel)
2076                 return;
2077         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2078         pixeli += span->y * dpsoftrast.fb_width + span->x;
2079         // handle alphatest now (this affects depth writes too)
2080         if (thread->alphatest)
2081                 for (x = startx;x < endx;x++)
2082                         if (in4ub[x*4+3] < 0.5f)
2083                                 pixelmask[x] = false;
2084         // FIXME: this does not handle bigendian
2085         switch(thread->fb_blendmode)
2086         {
2087         case DPSOFTRAST_BLENDMODE_OPAQUE:
2088                 for (x = startx;x + 4 <= endx;)
2089                 {
2090                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2091                         {
2092                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2093                                 x += 4;
2094                         }
2095                         else
2096                         {
2097                                 if (pixelmask[x])
2098                                         pixeli[x] = ini[x];
2099                                 x++;
2100                         }
2101                 }
2102                 for (;x < endx;x++)
2103                         if (pixelmask[x])
2104                                 pixeli[x] = ini[x];
2105                 break;
2106         case DPSOFTRAST_BLENDMODE_ALPHA:
2107         #define FINISHBLEND(blend2, blend1) \
2108                 for (x = startx;x + 2 <= endx;x += 2) \
2109                 { \
2110                         __m128i src, dst; \
2111                         switch (*(const unsigned short*)&pixelmask[x]) \
2112                         { \
2113                         case 0x0101: \
2114                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2115                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2116                                 blend2; \
2117                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2118                                 continue; \
2119                         case 0x0100: \
2120                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2121                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2122                                 blend1; \
2123                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2124                                 continue; \
2125                         case 0x0001: \
2126                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2127                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2128                                 blend1; \
2129                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2130                                 continue; \
2131                         } \
2132                         break; \
2133                 } \
2134                 for(;x < endx; x++) \
2135                 { \
2136                         __m128i src, dst; \
2137                         if (!pixelmask[x]) \
2138                                 continue; \
2139                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2140                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2141                         blend1; \
2142                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2143                 }
2144
2145                 FINISHBLEND({
2146                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2147                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2148                 }, {
2149                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2150                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2151                 });
2152                 break;
2153         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2154                 FINISHBLEND({
2155                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2156                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2157                 }, {
2158                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2159                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2160                 });
2161                 break;
2162         case DPSOFTRAST_BLENDMODE_ADD:
2163                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2164                 break;
2165         case DPSOFTRAST_BLENDMODE_INVMOD:
2166                 FINISHBLEND({
2167                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2168                 }, {
2169                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2170                 });
2171                 break;
2172         case DPSOFTRAST_BLENDMODE_MUL:
2173                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2174                 break;
2175         case DPSOFTRAST_BLENDMODE_MUL2:
2176                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2177                 break;
2178         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2179                 FINISHBLEND({
2180                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2181                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2182                 }, {
2183                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2184                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2185                 });
2186                 break;
2187         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2188                 FINISHBLEND({
2189                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2190                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2191                 }, {
2192                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2193                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2194                 });
2195                 break;
2196         }
2197 #endif
2198 }
2199
2200 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2201 {
2202         int x;
2203         int startx = span->startx;
2204         int endx = span->endx;
2205         int flags;
2206         float c[4];
2207         float data[4];
2208         float slope[4];
2209         float tc[2], endtc[2];
2210         float tcscale[2];
2211         unsigned int tci[2];
2212         unsigned int tci1[2];
2213         unsigned int tcimin[2];
2214         unsigned int tcimax[2];
2215         int tciwrapmask[2];
2216         int tciwidth;
2217         int filter;
2218         int mip;
2219         const unsigned char * RESTRICT pixelbase;
2220         const unsigned char * RESTRICT pixel[4];
2221         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2222         // if no texture is bound, just fill it with white
2223         if (!texture)
2224         {
2225                 for (x = startx;x < endx;x++)
2226                 {
2227                         out4f[x*4+0] = 1.0f;
2228                         out4f[x*4+1] = 1.0f;
2229                         out4f[x*4+2] = 1.0f;
2230                         out4f[x*4+3] = 1.0f;
2231                 }
2232                 return;
2233         }
2234         mip = triangle->mip[texunitindex];
2235         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2236         // if this mipmap of the texture is 1 pixel, just fill it with that color
2237         if (texture->mipmap[mip][1] == 4)
2238         {
2239                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2240                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2241                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2242                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2243                 for (x = startx;x < endx;x++)
2244                 {
2245                         out4f[x*4+0] = c[0];
2246                         out4f[x*4+1] = c[1];
2247                         out4f[x*4+2] = c[2];
2248                         out4f[x*4+3] = c[3];
2249                 }
2250                 return;
2251         }
2252         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2253         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2254         flags = texture->flags;
2255         tcscale[0] = texture->mipmap[mip][2];
2256         tcscale[1] = texture->mipmap[mip][3];
2257         tciwidth = texture->mipmap[mip][2];
2258         tcimin[0] = 0;
2259         tcimin[1] = 0;
2260         tcimax[0] = texture->mipmap[mip][2]-1;
2261         tcimax[1] = texture->mipmap[mip][3]-1;
2262         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2263         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2264         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2265         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2266         for (x = startx;x < endx;)
2267         {
2268                 unsigned int subtc[2];
2269                 unsigned int substep[2];
2270                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2271                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2272                 if(nextsub >= endx)
2273                 {
2274                         nextsub = endsub = endx-1;      
2275                         if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2276                 }
2277                 tc[0] = endtc[0];
2278                 tc[1] = endtc[1];
2279                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2280                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2281                 substep[0] = (endtc[0] - tc[0]) * subscale;
2282                 substep[1] = (endtc[1] - tc[1]) * subscale;
2283                 subtc[0] = tc[0] * (1<<16);
2284                 subtc[1] = tc[1] * (1<<16);
2285                 if(filter)
2286                 {
2287                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2288                         {
2289                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2290                                 {
2291                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2292                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2293                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2294                                         tci[0] = subtc[0]>>16;
2295                                         tci[1] = subtc[1]>>16;
2296                                         tci1[0] = tci[0] + 1;
2297                                         tci1[1] = tci[1] + 1;
2298                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2299                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2300                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2301                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2302                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2303                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2304                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2305                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2306                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2307                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2308                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2309                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2310                                         out4f[x*4+0] = c[0];
2311                                         out4f[x*4+1] = c[1];
2312                                         out4f[x*4+2] = c[2];
2313                                         out4f[x*4+3] = c[3];
2314                                 }
2315                         }
2316                         else
2317                         {
2318                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2319                                 {
2320                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2321                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2322                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2323                                         tci[0] = subtc[0]>>16;
2324                                         tci[1] = subtc[1]>>16;
2325                                         tci1[0] = tci[0] + 1;
2326                                         tci1[1] = tci[1] + 1;
2327                                         tci[0] &= tciwrapmask[0];
2328                                         tci[1] &= tciwrapmask[1];
2329                                         tci1[0] &= tciwrapmask[0];
2330                                         tci1[1] &= tciwrapmask[1];
2331                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2332                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2333                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2334                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2335                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2336                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2337                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2338                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2339                                         out4f[x*4+0] = c[0];
2340                                         out4f[x*4+1] = c[1];
2341                                         out4f[x*4+2] = c[2];
2342                                         out4f[x*4+3] = c[3];
2343                                 }
2344                         }
2345                 }
2346                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2347                 {
2348                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2349                         {
2350                                 tci[0] = subtc[0]>>16;
2351                                 tci[1] = subtc[1]>>16;
2352                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2353                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2354                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2355                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2356                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2357                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2358                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2359                                 out4f[x*4+0] = c[0];
2360                                 out4f[x*4+1] = c[1];
2361                                 out4f[x*4+2] = c[2];
2362                                 out4f[x*4+3] = c[3];
2363                         }
2364                 }
2365                 else
2366                 {
2367                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2368                         {
2369                                 tci[0] = subtc[0]>>16;
2370                                 tci[1] = subtc[1]>>16;
2371                                 tci[0] &= tciwrapmask[0];
2372                                 tci[1] &= tciwrapmask[1];
2373                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2374                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2375                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2376                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2377                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2378                                 out4f[x*4+0] = c[0];
2379                                 out4f[x*4+1] = c[1];
2380                                 out4f[x*4+2] = c[2];
2381                                 out4f[x*4+3] = c[3];
2382                         }
2383                 }
2384         }
2385 }
2386
2387 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2388 {
2389 #ifdef SSE2_PRESENT
2390         int x;
2391         int startx = span->startx;
2392         int endx = span->endx;
2393         int flags;
2394         __m128 data, slope, tcscale;
2395         __m128i tcsize, tcmask, tcoffset, tcmax;
2396         __m128 tc, endtc;
2397         __m128i subtc, substep, endsubtc;
2398         int filter;
2399         int mip;
2400         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2401         const unsigned char * RESTRICT pixelbase;
2402         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2403         // if no texture is bound, just fill it with white
2404         if (!texture)
2405         {
2406                 memset(out4ub + startx*4, 255, span->length*4);
2407                 return;
2408         }
2409         mip = triangle->mip[texunitindex];
2410         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2411         // if this mipmap of the texture is 1 pixel, just fill it with that color
2412         if (texture->mipmap[mip][1] == 4)
2413         {
2414                 unsigned int k = *((const unsigned int *)pixelbase);
2415                 for (x = startx;x < endx;x++)
2416                         outi[x] = k;
2417                 return;
2418         }
2419         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2420         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2421         flags = texture->flags;
2422         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2423         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2424         tcscale = _mm_cvtepi32_ps(tcsize);
2425         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2426         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2427         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2428         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2429         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2430         tcmax = _mm_packs_epi32(tcmask, tcmask);
2431         for (x = startx;x < endx;)
2432         {
2433                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2434                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2435                 if(nextsub >= endx)
2436                 {
2437                         nextsub = endsub = endx-1;
2438                         if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2439                 }       
2440                 tc = endtc;
2441                 subtc = endsubtc;
2442                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2443                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2444                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2445                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2446                 substep = _mm_slli_epi32(substep, 1);
2447                 if (filter)
2448                 {
2449                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2450                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2451                         {
2452                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2453                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2454                                 {
2455                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2456                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2457                                         tci = _mm_madd_epi16(tci, tcoffset);
2458                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2459                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2460                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2461                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2462                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2463                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2464                                         fracm = _mm_srli_epi16(subtc, 1);
2465                                         pix1 = _mm_add_epi16(pix1,
2466                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2467                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2468                                         pix3 = _mm_add_epi16(pix3,
2469                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2470                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2471                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2472                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2473                                         pix2 = _mm_add_epi16(pix2,
2474                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2475                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2476                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2477                                 }
2478                                 if (x <= endsub)
2479                                 {
2480                                         const unsigned char * RESTRICT ptr1;
2481                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2482                                         tci = _mm_madd_epi16(tci, tcoffset);
2483                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2484                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2485                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2486                                         fracm = _mm_srli_epi16(subtc, 1);
2487                                         pix1 = _mm_add_epi16(pix1,
2488                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2489                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2490                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2491                                         pix1 = _mm_add_epi16(pix1,
2492                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2493                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2494                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2495                                         x++;
2496                                 }
2497                         }
2498                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2499                         {
2500                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2501                                 {
2502                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2503                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2504                                         tci = _mm_madd_epi16(tci, tcoffset);
2505                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2506                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2507                                                                                         _mm_setzero_si128());
2508                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2509                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2510                                                                                         _mm_setzero_si128());
2511                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2512                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2513                                         tci = _mm_madd_epi16(tci, tcoffset);
2514                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2515                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2516                                                                                         _mm_setzero_si128());
2517                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2518                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2519                                                                                         _mm_setzero_si128());
2520                                         fracm = _mm_srli_epi16(subtc, 1);
2521                                         pix1 = _mm_add_epi16(pix1,
2522                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2523                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2524                                         pix3 = _mm_add_epi16(pix3,
2525                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2526                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2527                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2528                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2529                                         pix2 = _mm_add_epi16(pix2,
2530                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2531                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2532                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2533                                 }
2534                                 if (x <= endsub)
2535                                 {
2536                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2537                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2538                                         tci = _mm_madd_epi16(tci, tcoffset);
2539                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2540                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2541                                                                                         _mm_setzero_si128());
2542                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2543                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2544                                                                                         _mm_setzero_si128());
2545                                         fracm = _mm_srli_epi16(subtc, 1);
2546                                         pix1 = _mm_add_epi16(pix1,
2547                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2548                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2549                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2550                                         pix1 = _mm_add_epi16(pix1,
2551                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2552                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2553                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2554                                         x++;
2555                                 }
2556                         }
2557                         else
2558                         {
2559                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2560                                 {
2561                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2562                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2563                                         tci = _mm_madd_epi16(tci, tcoffset);
2564                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2565                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2566                                                                                         _mm_setzero_si128());
2567                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2568                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2569                                                                                         _mm_setzero_si128());
2570                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2571                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2572                                         tci = _mm_madd_epi16(tci, tcoffset);
2573                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2574                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2575                                                                                         _mm_setzero_si128());
2576                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2577                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2578                                                                                         _mm_setzero_si128());
2579                                         fracm = _mm_srli_epi16(subtc, 1);
2580                                         pix1 = _mm_add_epi16(pix1,
2581                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2582                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2583                                         pix3 = _mm_add_epi16(pix3,
2584                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2585                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2586                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2587                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2588                                         pix2 = _mm_add_epi16(pix2,
2589                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2590                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2591                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2592                                 }
2593                                 if (x <= endsub)
2594                                 {
2595                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2596                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2597                                         tci = _mm_madd_epi16(tci, tcoffset);
2598                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2599                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2600                                                                                         _mm_setzero_si128());
2601                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2602                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2603                                                                                         _mm_setzero_si128());
2604                                         fracm = _mm_srli_epi16(subtc, 1);
2605                                         pix1 = _mm_add_epi16(pix1,
2606                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2607                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2608                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2609                                         pix1 = _mm_add_epi16(pix1,
2610                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2611                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2612                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2613                                         x++;
2614                                 }
2615                         }
2616                 }
2617                 else
2618                 {
2619                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2620                         {
2621                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2622                                 {
2623                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2624                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2625                                         tci = _mm_madd_epi16(tci, tcoffset);
2626                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2627                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2628                                 }
2629                                 if (x <= endsub)
2630                                 {
2631                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2632                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2633                                         tci = _mm_madd_epi16(tci, tcoffset);
2634                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2635                                         x++;
2636                                 }
2637                         }
2638                         else
2639                         {
2640                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2641                                 {
2642                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2643                                         tci = _mm_and_si128(tci, tcmax); 
2644                                         tci = _mm_madd_epi16(tci, tcoffset);
2645                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2646                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2647                                 }
2648                                 if (x <= endsub)
2649                                 {
2650                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2651                                         tci = _mm_and_si128(tci, tcmax); 
2652                                         tci = _mm_madd_epi16(tci, tcoffset);
2653                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2654                                         x++;
2655                                 }
2656                         }
2657                 }
2658         }
2659 #endif
2660 }
2661
2662 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2663 {
2664         // TODO: IMPLEMENT
2665         memset(out4ub, 255, span->length*4);
2666 }
2667
// Placeholder for shadowmap sampling.
// The input vector is ignored and the result is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2673
2674 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2675 {
2676         int x;
2677         int startx = span->startx;
2678         int endx = span->endx;
2679         float c[4];
2680         float data[4];
2681         float slope[4];
2682         float z;
2683         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2684         for (x = startx;x < endx;x++)
2685         {
2686                 z = zf[x];
2687                 c[0] = (data[0] + slope[0]*x) * z;
2688                 c[1] = (data[1] + slope[1]*x) * z;
2689                 c[2] = (data[2] + slope[2]*x) * z;
2690                 c[3] = (data[3] + slope[3]*x) * z;
2691                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2692                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2693                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2694                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2695         }
2696 }
2697
2698 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2699 {
2700         int x;
2701         int startx = span->startx;
2702         int endx = span->endx;
2703         float c[4];
2704         float data[4];
2705         float slope[4];
2706         float z;
2707         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2708         for (x = startx;x < endx;x++)
2709         {
2710                 z = zf[x];
2711                 c[0] = (data[0] + slope[0]*x) * z;
2712                 c[1] = (data[1] + slope[1]*x) * z;
2713                 c[2] = (data[2] + slope[2]*x) * z;
2714                 c[3] = (data[3] + slope[3]*x) * z;
2715                 out4f[x*4+0] = c[0];
2716                 out4f[x*4+1] = c[1];
2717                 out4f[x*4+2] = c[2];
2718                 out4f[x*4+3] = c[3];
2719         }
2720 }
2721
2722 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2723 {
2724         int x, startx = span->startx, endx = span->endx;
2725         float c[4], localcolor[4];
2726         localcolor[0] = subcolor[0];
2727         localcolor[1] = subcolor[1];
2728         localcolor[2] = subcolor[2];
2729         localcolor[3] = subcolor[3];
2730         for (x = startx;x < endx;x++)
2731         {
2732                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2733                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2734                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2735                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2736                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2737                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2738                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2739                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2740         }
2741 }
2742
2743 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2744 {
2745         int x, startx = span->startx, endx = span->endx;
2746         for (x = startx;x < endx;x++)
2747         {
2748                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2749                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2750                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2751                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2752         }
2753 }
2754
2755 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2756 {
2757         int x, startx = span->startx, endx = span->endx;
2758         for (x = startx;x < endx;x++)
2759         {
2760                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2761                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2762                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2763                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2764         }
2765 }
2766
2767 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2768 {
2769         int x, startx = span->startx, endx = span->endx;
2770         float a, b;
2771         for (x = startx;x < endx;x++)
2772         {
2773                 a = 1.0f - inb4f[x*4+3];
2774                 b = inb4f[x*4+3];
2775                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2776                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2777                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2778                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2779         }
2780 }
2781
2782 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2783 {
2784         int x, startx = span->startx, endx = span->endx;
2785         float localcolor[4], ilerp, lerp;
2786         localcolor[0] = color[0];
2787         localcolor[1] = color[1];
2788         localcolor[2] = color[2];
2789         localcolor[3] = color[3];
2790         ilerp = 1.0f - localcolor[3];
2791         lerp = localcolor[3];
2792         for (x = startx;x < endx;x++)
2793         {
2794                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2795                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2796                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2797                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2798         }
2799 }
2800
2801
2802
2803 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2804 {
2805 #ifdef SSE2_PRESENT
2806         int x;
2807         int startx = span->startx;
2808         int endx = span->endx;
2809         __m128 data, slope;
2810         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2811         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2812         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2813         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2814         data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
2815         slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
2816         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2817         {
2818                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2819                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
2820                 data = _mm_add_ps(data, slope);
2821                 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2822                 mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
2823                 pix = _mm_mulhi_epu16(pix, mod);
2824                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2825         }
2826         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2827         {
2828                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2829                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2830                 mod = _mm_packs_epi32(mod, mod);
2831                 pix = _mm_mulhi_epu16(pix, mod);
2832                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2833         }
2834 #endif
2835 }
2836
2837 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2838 {
2839 #ifdef SSE2_PRESENT
2840         int x;
2841         int startx = span->startx;
2842         int endx = span->endx;
2843         __m128 data, slope;
2844         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2845         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2846         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2847         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2848         data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
2849         slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
2850         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2851         {
2852                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
2853                 data = _mm_add_ps(data, slope);
2854                 pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2855                 pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
2856                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2857         }
2858         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2859         {
2860                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2861                 pix = _mm_packs_epi32(pix, pix);
2862                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2863         }
2864 #endif
2865 }
2866
2867 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2868 {
2869 #ifdef SSE2_PRESENT
2870         int x, startx = span->startx, endx = span->endx;
2871         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2872         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2873         for (x = startx;x+2 <= endx;x+=2)
2874         {
2875                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2876                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2877                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2878                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2879         }
2880         if(x < endx)
2881         {
2882                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2883                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2884                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2885                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2886         }
2887 #endif
2888 }
2889
2890 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2891 {
2892 #ifdef SSE2_PRESENT
2893         int x, startx = span->startx, endx = span->endx;
2894         for (x = startx;x+2 <= endx;x+=2)
2895         {
2896                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2897                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2898                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2899                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2900         }
2901         if(x < endx)
2902         {
2903                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2904                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2905                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2906                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2907         }
2908 #endif
2909 }
2910
2911 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2912 {
2913 #ifdef SSE2_PRESENT
2914         int x, startx = span->startx, endx = span->endx;
2915         for (x = startx;x+2 <= endx;x+=2)
2916         {
2917                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2918                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2919                 pix1 = _mm_add_epi16(pix1, pix2);
2920                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2921         }
2922         if(x < endx)
2923         {
2924                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2925                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2926                 pix1 = _mm_add_epi16(pix1, pix2);
2927                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2928         }
2929 #endif
2930 }
2931
2932 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2933 {
2934 #ifdef SSE2_PRESENT
2935         int x, startx = span->startx, endx = span->endx;
2936         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
2937         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
2938         for (x = startx;x+2 <= endx;x+=2)
2939         {
2940                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2941                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2942                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2943                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2944         }
2945         if(x < endx)
2946         {
2947                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2948                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2949                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2950                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2951         }
2952 #endif
2953 }
2954
2955 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2956 {
2957 #ifdef SSE2_PRESENT
2958         int x, startx = span->startx, endx = span->endx;
2959         for (x = startx;x+2 <= endx;x+=2)
2960         {
2961                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2962                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2963                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2964                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2965                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2966         }
2967         if(x < endx)
2968         {
2969                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2970                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2971                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
2972                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
2973                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2974         }
2975 #endif
2976 }
2977
2978 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
2979 {
2980 #ifdef SSE2_PRESENT
2981         int x, startx = span->startx, endx = span->endx;
2982         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
2983         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2984         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
2985         for (x = startx;x+2 <= endx;x+=2)
2986         {
2987                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
2988                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2989                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2990         }
2991         if(x < endx)
2992         {
2993                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
2994                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
2995                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2996         }
2997 #endif
2998 }
2999
3000
3001
3002 void DPSOFTRAST_VertexShader_Generic(void)
3003 {
3004         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3005         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
3006         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
3007         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3008                 DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
3009 }
3010
3011 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3012 {
3013         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3014         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3015         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3016         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3017         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3018         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3019         {
3020                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3021                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3022                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3023                 {
3024                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3025                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3026                         {
3027                                 // multiply
3028                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3029                         }
3030                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3031                         {
3032                                 // add
3033                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3034                         }
3035                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3036                         {
3037                                 // alphablend
3038                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3039                         }
3040                 }
3041         }
3042         else
3043                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3044         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3045 }
3046
3047
3048
3049 void DPSOFTRAST_VertexShader_PostProcess(void)
3050 {
3051         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3052         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices);
3053         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1], dpsoftrast.numvertices);
3054 }
3055
3056 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3057 {
3058         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3059         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3060         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3061         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3062         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3063         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3064         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3065         {
3066                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3067                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3068         }
3069         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3070         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3071         {
3072                 // TODO: implement saturation
3073         }
3074         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3075         {
3076                 // TODO: implement gammaramps
3077         }
3078         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3079 }
3080
3081
3082
3083 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3084 {
3085         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3086 }
3087
3088 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3089 {
3090         // this is never called (because colormask is off when this shader is used)
3091         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3092         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3093         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3094         memset(buffer_FragColorbgra8, 0, span->length*4);
3095         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3096 }
3097
3098
3099
3100 void DPSOFTRAST_VertexShader_FlatColor(void)
3101 {
3102         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3103         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3104 }
3105
3106 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3107 {
3108         int x, startx = span->startx, endx = span->endx;
3109         int Color_Ambienti[4];
3110         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3111         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3112         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3113         Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3114         Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3115         Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3116         Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
3117         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3118         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3119         for (x = startx;x < endx;x++)
3120         {
3121                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3122                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3123                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3124                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3125         }
3126         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3127 }
3128
3129
3130
3131 void DPSOFTRAST_VertexShader_VertexColor(void)
3132 {
3133         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3134         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_COLOR], dpsoftrast.numvertices);
3135         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3136 }
3137
// Pixel stage of the vertex-color shader for one span.
// Per pixel and channel the result is
//     texture * (Color_Ambient + Color_Diffuse * interpolated vertex color)
// computed in 16-bit fixed point and saturated to 8 bits.  The fourth (alpha)
// lane of Color_Diffuse is zeroed and the fourth lane of the ambient term is
// replaced by the Alpha uniform, so output alpha is texture alpha * Alpha and
// the vertex color's alpha does not contribute.
// When alpha test is off and the blend mode is opaque, pixels are written
// straight into dpsoftrast.fb_colorpixels[0]; otherwise they go to a local
// buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Without SSE2_PRESENT the body compiles to nothing.
3138 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3139 {
3140 #ifdef SSE2_PRESENT
3141         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: the framebuffer row span->y, starting at column span->x
             // (so x below is an offset relative to span->x)
3142         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3143         int x, startx = span->startx, endx = span->endx;
3144         __m128i Color_Ambientm, Color_Diffusem;
3145         __m128 data, slope;
3146         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3147         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3148         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3149         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3150         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3151         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test or any non-opaque blend needs the Finish pass, so render into the local buffer instead
3152         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3153                 pixel = buffer_FragColorbgra8;
             // ambient * 256 with components 0 and 2 exchanged; the fourth lane is replaced by Alpha * 255;
             // then narrowed to 16-bit words (same four words in both halves of the register)
3154         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3155         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3156         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3157         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse * 4096 with the same component exchange, fourth lane forced to zero
3158         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3159         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3160         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
             // start value and per-pixel step of the vertex color interpolant, reordered like the
             // uniforms, advanced to startx and scaled by 4096
             // (4096 * 4096 >> 16 = 256, matching the ambient scale after the first high multiply)
3161         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3162         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3163         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3164         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3165         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3166         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // the loop increment advances both x and the interpolant by one pixel
3167         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3168         {
3169                 __m128i color, mod, pix;
                     // fast path: at least four pixels remain and all four mask bytes are 1
3170                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3171                 {
3172                         __m128i pix2, mod2;
3173                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3174                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // interpolant times the matching buffer_z entry for each of the four pixels;
                             // data is stepped three times here and a fourth time by the loop increment
3175                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3176                         data = _mm_add_ps(data, slope);
3177                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3178                         data = _mm_add_ps(data, slope);
3179                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3180                         data = _mm_add_ps(data, slope);
3181                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                             // texels are widened to 0xCC00 words so the outer high multiply divides by 256
3182                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3183                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3184                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3185                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3186                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // three extra pixels consumed; the loop increment accounts for the fourth
3187                         x += 3;
3188                         continue;
3189                 }
                     // single-pixel path: skip pixels whose mask byte is zero
3190                 if(!pixelmask[x])
3191                         continue;
3192                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3193                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3194                 mod = _mm_packs_epi32(mod, mod);
3195                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3196                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3197         }
             // only the buffered path still needs the Finish pass
3198         if(pixel == buffer_FragColorbgra8)
3199                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3200 #endif
3201 }
3202
3203
3204
3205 void DPSOFTRAST_VertexShader_Lightmap(void)
3206 {
3207         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3208         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3209         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
3210 }
3211
// Pixel stage of the lightmap shader for one span.
// Per pixel and channel the result is
//     color texture * (Color_Ambient + Color_Diffuse * lightmap texture)
// plus, when the GLOW permutation is set,
//     Color_Glow * glow texture
// computed in 16-bit fixed point and saturated to 8 bits.  The fourth (alpha)
// lanes of Color_Diffuse and Color_Glow are zeroed and the fourth lane of the
// ambient term is replaced by the Alpha uniform, so output alpha is
// color texture alpha * Alpha.
// When alpha test is off and the blend mode is opaque, pixels are written
// straight into dpsoftrast.fb_colorpixels[0]; otherwise they go to a local
// buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Without SSE2_PRESENT the body compiles to nothing.
3212 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3213 {
3214 #ifdef SSE2_PRESENT
3215         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: the framebuffer row span->y, starting at column span->x
             // (so x below is an offset relative to span->x)
3216         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3217         int x, startx = span->startx, endx = span->endx;
3218         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3219         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3220         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3221         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3222         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3223         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3224         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // color texture uses TEXCOORD0, lightmap texture uses TEXCOORD4
3225         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3226         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // alpha test or any non-opaque blend needs the Finish pass, so render into the local buffer instead
3227         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3228                 pixel = buffer_FragColorbgra8;
             // ambient * 256 with components 0 and 2 exchanged; the fourth lane is replaced by Alpha * 255;
             // then narrowed to 16-bit words (same four words in both halves of the register)
3229         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3230         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3231         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3232         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse * 256 with the same component exchange, fourth lane forced to zero
3233         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3234         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3235         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3236         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3237         {
                     // glow texture shares TEXCOORD0 with the color texture
3238                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
                     // glow * 256, prepared the same way as the diffuse color
3239                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3240                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3241                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path below
3242                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3243                 for (x = startx;x < endx;x++)
3244                 {
3245                         __m128i color, lightmap, glow, pix;
                             // fast path: at least four pixels remain and all four mask bytes are 1
3246                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3247                         {
3248                                 __m128i pix2;
3249                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3250                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3251                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // pix covers pixels x and x+1, pix2 covers pixels x+2 and x+3
3252                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3253                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3254                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3255                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3256                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3257                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3258                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // three extra pixels consumed; the loop increment accounts for the fourth
3259                                 x += 3;
3260                                 continue;
3261                         }
                             // single-pixel path: skip pixels whose mask byte is zero
3262                         if(!pixelmask[x])
3263                                 continue;
3264                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3265                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3266                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // one multiply does both terms: the low half yields (diffuse*lightmap + ambient) * color,
                             // the high half yields glow * glowtexel (lightmap's high half is zero there)
3267                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // fold the glow term from the high half onto the lit color in the low half
3268                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3269                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3270                 }
3271         }
3272         else
3273         {
                     // same loop without the glow term
3274                 for (x = startx;x < endx;x++)
3275                 {
3276                         __m128i color, lightmap, pix;
                             // fast path: at least four pixels remain and all four mask bytes are 1
3277                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3278                         {
3279                                 __m128i pix2;
3280                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3281                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3282                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3283                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3284                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3285                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3286                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3287                                 x += 3;
3288                                 continue;
3289                         }
                             // single-pixel path: skip pixels whose mask byte is zero
3290                         if(!pixelmask[x]) 
3291                                 continue;
3292                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3293                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3294                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3295                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3296                 }
3297         }
             // only the buffered path still needs the Finish pass
3298         if(pixel == buffer_FragColorbgra8)
3299                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3300 #endif
3301 }
3302
3303
3304
3305 void DPSOFTRAST_VertexShader_FakeLight(void)
3306 {
3307         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3308 }
3309
3310 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3311 {
3312         // TODO: IMPLEMENT
3313         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3314         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3315         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3316         memset(buffer_FragColorbgra8, 0, span->length*4);
3317         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3318 }
3319
3320
3321
// Vertex stage of the model-space light-direction-map shader.
// Currently just runs the Lightmap vertex shader.
3322 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3323 {
3324         DPSOFTRAST_VertexShader_Lightmap();
3325 }
3326
// Pixel stage of the model-space light-direction-map shader.
// Currently renders the span exactly as the Lightmap pixel shader does; the
// shader's own lighting is still to be written (see the TODO below).
3327 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3328 {
3329         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3330         // TODO: IMPLEMENT
3331 }
3332
3333
3334
// Vertex stage of the tangent-space light-direction-map shader.
// Currently just runs the Lightmap vertex shader.
3335 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3336 {
3337         DPSOFTRAST_VertexShader_Lightmap();
3338 }
3339
// Pixel stage of the tangent-space light-direction-map shader.
// Currently renders the span exactly as the Lightmap pixel shader does; the
// shader's own lighting is still to be written (see the TODO below).
3340 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3341 {
3342         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3343         // TODO: IMPLEMENT
3344 }
3345
3346
3347
3348 void DPSOFTRAST_VertexShader_LightDirection(void)
3349 {
3350         int i;
3351         int numvertices = dpsoftrast.numvertices;
3352         float LightDir[4];
3353         float LightVector[4];
3354         float EyePosition[4];
3355         float EyeVectorModelSpace[4];
3356         float EyeVector[4];
3357         float position[4];
3358         float svector[4];
3359         float tvector[4];
3360         float normal[4];
3361         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3362         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3363         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3364         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3365         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3366         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3367         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3368         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3369         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3370         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3371         for (i = 0;i < numvertices;i++)
3372         {
3373                 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3374                 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3375                 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3376                 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3377                 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3378                 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3379                 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3380                 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3381                 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3382                 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3383                 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3384                 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3385                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3386                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3387                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3388                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3389                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3390                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3391                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3392                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3393                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3394                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3395                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3396                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3397                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3398                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3399                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3400                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3401                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3402         }
3403 }
3404
3405 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3406 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3407 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3408 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3409 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3410 #define DPSOFTRAST_Vector3Normalize(v)\
3411 do\
3412 {\
3413         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3414         if (len)\
3415         {\
3416                 len = 1.0f / len;\
3417                 v[0] *= len;\
3418                 v[1] *= len;\
3419                 v[2] *= len;\
3420         }\
3421 }\
3422 while(0)
3423
3424 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3425 {
3426         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3427         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3428         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3429         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3430         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3431         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3432         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3433         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3434         int x, startx = span->startx, endx = span->endx;
3435         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3436         float LightVectordata[4];
3437         float LightVectorslope[4];
3438         float EyeVectordata[4];
3439         float EyeVectorslope[4];
3440         float z;
3441         float diffusetex[4];
3442         float glosstex[4];
3443         float surfacenormal[4];
3444         float lightnormal[4];
3445         float eyenormal[4];
3446         float specularnormal[4];
3447         float diffuse;
3448         float specular;
3449         float SpecularPower;
3450         int d[4];
3451         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3452         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3453         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3454         Color_Glow[3] = 0.0f;
3455         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3456         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3457         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3458         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3459         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3460         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3461         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3462         Color_Pants[3] = 0.0f;
3463         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3464         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3465         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3466         Color_Shirt[3] = 0.0f;
3467         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3468         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3469         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3470         {
3471                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3472                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3473         }
3474         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3475         {
3476                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3477         }
3478         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3479         {
3480                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3481                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3482                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3483                 Color_Diffuse[3] = 0.0f;
3484                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3485                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3486                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3487                 LightColor[3] = 0.0f;
3488                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3489                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3490                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3491                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3492                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3493                 Color_Specular[3] = 0.0f;
3494                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3495                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3496                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3497                 for (x = startx;x < endx;x++)
3498                 {
3499                         z = buffer_z[x];
3500                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3501                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3502                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3503                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3504                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3505                         {
3506                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3507                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3508                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3509                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3510                         }
3511                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3512                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3513                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3514                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3515                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3516                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3517                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3518                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3519
3520                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3521                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3522                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3523                         DPSOFTRAST_Vector3Normalize(lightnormal);
3524
3525                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3526                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3527                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3528                         DPSOFTRAST_Vector3Normalize(eyenormal);
3529
3530                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3531                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3532                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3533                         DPSOFTRAST_Vector3Normalize(specularnormal);
3534
3535                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3536                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3537                         specular = pow(specular, SpecularPower * glosstex[3]);
3538                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3539                         {
3540                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3541                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3542                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3543                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3544                         }
3545                         else
3546                         {
3547                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3548                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3549                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3550                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3551                         }
3552                         buffer_FragColorbgra8[x*4+0] = d[0];
3553                         buffer_FragColorbgra8[x*4+1] = d[1];
3554                         buffer_FragColorbgra8[x*4+2] = d[2];
3555                         buffer_FragColorbgra8[x*4+3] = d[3];
3556                 }
3557         }
3558         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3559         {
3560                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3561                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3562                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3563                 Color_Diffuse[3] = 0.0f;
3564                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3565                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3566                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3567                 LightColor[3] = 0.0f;
3568                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3569                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3570                 for (x = startx;x < endx;x++)
3571                 {
3572                         z = buffer_z[x];
3573                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3574                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3575                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3576                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3577                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3578                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3579                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3580                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3581
3582                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3583                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3584                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3585                         DPSOFTRAST_Vector3Normalize(lightnormal);
3586
3587                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3588                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3589                         {
3590                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3591                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3592                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3593                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3594                         }
3595                         else
3596                         {
3597                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3598                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3599                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3600                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3601                         }
3602                         buffer_FragColorbgra8[x*4+0] = d[0];
3603                         buffer_FragColorbgra8[x*4+1] = d[1];
3604                         buffer_FragColorbgra8[x*4+2] = d[2];
3605                         buffer_FragColorbgra8[x*4+3] = d[3];
3606                 }
3607         }
3608         else
3609         {
3610                 for (x = startx;x < endx;x++)
3611                 {
3612                         z = buffer_z[x];
3613                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3614                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3615                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3616                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3617
3618                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3619                         {
3620                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3621                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3622                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3623                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3624                         }
3625                         else
3626                         {
3627                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3628                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3629                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3630                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3631                         }
3632                         buffer_FragColorbgra8[x*4+0] = d[0];
3633                         buffer_FragColorbgra8[x*4+1] = d[1];
3634                         buffer_FragColorbgra8[x*4+2] = d[2];
3635                         buffer_FragColorbgra8[x*4+3] = d[3];
3636                 }
3637         }
3638         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3639 }
3640
3641
3642
3643 void DPSOFTRAST_VertexShader_LightSource(void)
3644 {
3645         int i;
3646         int numvertices = dpsoftrast.numvertices;
3647         float LightPosition[4];
3648         float LightVector[4];
3649         float LightVectorModelSpace[4];
3650         float EyePosition[4];
3651         float EyeVectorModelSpace[4];
3652         float EyeVector[4];
3653         float position[4];
3654         float svector[4];
3655         float tvector[4];
3656         float normal[4];
3657         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3658         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3659         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3660         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3661         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3662         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3663         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3664         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3665         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3666         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD0], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3667         DPSOFTRAST_Array_Transform(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3668         DPSOFTRAST_Array_Copy(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD4], dpsoftrast.numvertices);
3669         for (i = 0;i < numvertices;i++)
3670         {
3671                 position[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3672                 position[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3673                 position[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3674                 svector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3675                 svector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3676                 svector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3677                 tvector[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3678                 tvector[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3679                 tvector[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3680                 normal[0] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3681                 normal[1] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3682                 normal[2] = dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3683                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3684                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3685                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3686                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3687                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3688                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3689                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3690                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3691                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3692                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3693                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3694                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3695                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3696                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3697                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3698                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3699                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3700                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3701                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3702                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3703         }
3704 }
3705
3706 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3707 {
3708 #ifdef SSE2_PRESENT
3709         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3710         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3711         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3712         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3713         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3714         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3715         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3716         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3717         int x, startx = span->startx, endx = span->endx;
3718         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3719         float CubeVectordata[4];
3720         float CubeVectorslope[4];
3721         float LightVectordata[4];
3722         float LightVectorslope[4];
3723         float EyeVectordata[4];
3724         float EyeVectorslope[4];
3725         float z;
3726         float diffusetex[4];
3727         float glosstex[4];
3728         float surfacenormal[4];
3729         float lightnormal[4];
3730         float eyenormal[4];
3731         float specularnormal[4];
3732         float diffuse;
3733         float specular;
3734         float SpecularPower;
3735         float CubeVector[4];
3736         float attenuation;
3737         int d[4];
3738         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3739         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3740         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3741         Color_Glow[3] = 0.0f;
3742         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3743         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3744         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3745         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3746         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3747         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3748         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3749         Color_Diffuse[3] = 0.0f;
3750         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3751         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3752         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3753         Color_Specular[3] = 0.0f;
3754         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3755         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3756         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3757         Color_Pants[3] = 0.0f;
3758         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3759         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3760         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3761         Color_Shirt[3] = 0.0f;
3762         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3763         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3764         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3765         LightColor[3] = 0.0f;
3766         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3767         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3768         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3769         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3770         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3771         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3772         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3773         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3774         {
3775                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3776                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3777         }
3778         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3779                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3780         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3781         {
3782                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3783                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3784                 for (x = startx;x < endx;x++)
3785                 {
3786                         z = buffer_z[x];
3787                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3788                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3789                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3790                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3791                         if (attenuation < 0.01f)
3792                                 continue;
3793                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3794                         {
3795                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3796                                 if (attenuation < 0.01f)
3797                                         continue;
3798                         }
3799
3800                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3801                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3802                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3803                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3804                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3805                         {
3806                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3807                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3808                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3809                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3810                         }
3811                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3812                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3813                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3814                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3815                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3816                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3817                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3818                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3819
3820                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3821                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3822                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3823                         DPSOFTRAST_Vector3Normalize(lightnormal);
3824
3825                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3826                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3827                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3828                         DPSOFTRAST_Vector3Normalize(eyenormal);
3829
3830                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3831                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3832                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3833                         DPSOFTRAST_Vector3Normalize(specularnormal);
3834
3835                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3836                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3837                         specular = pow(specular, SpecularPower * glosstex[3]);
3838                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3839                         {
3840                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3841                                 attenuation *= (1.0f / 255.0f);
3842                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3843                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3844                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3845                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3846                         }
3847                         else
3848                         {
3849                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3850                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3851                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3852                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3853                         }
3854                         buffer_FragColorbgra8[x*4+0] = d[0];
3855                         buffer_FragColorbgra8[x*4+1] = d[1];
3856                         buffer_FragColorbgra8[x*4+2] = d[2];
3857                         buffer_FragColorbgra8[x*4+3] = d[3];
3858                 }
3859         }
3860         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3861         {
3862                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3863                 for (x = startx;x < endx;x++)
3864                 {
3865                         z = buffer_z[x];
3866                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3867                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3868                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3869                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3870                         if (attenuation < 0.01f)
3871                                 continue;
3872                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3873                         {
3874                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3875                                 if (attenuation < 0.01f)
3876                                         continue;
3877                         }
3878
3879                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3880                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3881                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3882                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3883                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3884                         {
3885                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3886                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3887                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3888                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3889                         }
3890                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3891                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3892                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3893                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3894
3895                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3896                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3897                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3898                         DPSOFTRAST_Vector3Normalize(lightnormal);
3899
3900                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3901                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3902                         {
3903                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3904                                 attenuation *= (1.0f / 255.0f);
3905                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3906                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3907                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3908                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
3909                         }
3910                         else
3911                         {
3912                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3913                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3914                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3915                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3916                         }
3917                         buffer_FragColorbgra8[x*4+0] = d[0];
3918                         buffer_FragColorbgra8[x*4+1] = d[1];
3919                         buffer_FragColorbgra8[x*4+2] = d[2];
3920                         buffer_FragColorbgra8[x*4+3] = d[3];
3921                 }
3922         }
3923         else
3924         {
3925                 for (x = startx;x < endx;x++)
3926                 {
3927                         z = buffer_z[x];
3928                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3929                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3930                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3931                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3932                         if (attenuation < 0.01f)
3933                                 continue;
3934                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3935                         {
3936                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3937                                 if (attenuation < 0.01f)
3938                                         continue;
3939                         }
3940
3941                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3942                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3943                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3944                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3945                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3946                         {
3947                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3948                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3949                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3950                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3951                         }
3952                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3953                         {
3954                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3955                                 attenuation *= (1.0f / 255.0f);
3956                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3957                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3958                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3959                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
3960                         }
3961                         else
3962                         {
3963                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3964                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3965                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3966                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3967                         }
3968                         buffer_FragColorbgra8[x*4+0] = d[0];
3969                         buffer_FragColorbgra8[x*4+1] = d[1];
3970                         buffer_FragColorbgra8[x*4+2] = d[2];
3971                         buffer_FragColorbgra8[x*4+3] = d[3];
3972                 }
3973         }
3974         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3975 #endif
3976 }
3977
3978
3979
3980 void DPSOFTRAST_VertexShader_Refraction(void)
3981 {
3982         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3983 }
3984
3985 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3986 {
3987         // TODO: IMPLEMENT
3988         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3989         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3990         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3991         memset(buffer_FragColorbgra8, 0, span->length*4);
3992         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3993 }
3994
3995
3996
3997 void DPSOFTRAST_VertexShader_Water(void)
3998 {
3999         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4000 }
4001
4002
4003 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4004 {
4005         // TODO: IMPLEMENT
4006         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4007         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4008         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4009         memset(buffer_FragColorbgra8, 0, span->length*4);
4010         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4011 }
4012
4013
4014
4015 void DPSOFTRAST_VertexShader_ShowDepth(void)
4016 {
4017         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4018 }
4019
4020 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4021 {
4022         // TODO: IMPLEMENT
4023         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4024         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4025         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4026         memset(buffer_FragColorbgra8, 0, span->length*4);
4027         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4028 }
4029
4030
4031
4032 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4033 {
4034         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4035 }
4036
4037 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4038 {
4039         // TODO: IMPLEMENT
4040         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4041         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4042         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4043         memset(buffer_FragColorbgra8, 0, span->length*4);
4044         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4045 }
4046
4047
4048
4049 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4050 {
4051         DPSOFTRAST_Array_TransformProject(dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.screencoord4f, dpsoftrast.in_array4f[DPSOFTRAST_ARRAY_POSITION], dpsoftrast.numvertices, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4052 }
4053
4054 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4055 {
4056         // TODO: IMPLEMENT
4057         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4058         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4059         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4060         memset(buffer_FragColorbgra8, 0, span->length*4);
4061         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4062 }
4063
4064
4065
4066 typedef struct DPSOFTRAST_ShaderModeInfo_s
4067 {
4068         int lodarrayindex;
4069         void (*Vertex)(void);
4070         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4071         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4072         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4073 }
4074 DPSOFTRAST_ShaderModeInfo;
4075
4076 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4077 {
4078         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4079         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4080         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4081         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4082         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4083         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4084         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4085         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4086         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4087         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4088         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4089         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4090         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4091         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4092         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4093         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4094 };
4095
4096
4097 int DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int commandoffset, int endoffset)
4098 {
4099         while (commandoffset != endoffset)
4100         {
4101                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4102                 switch (command->opcode)
4103                 {
4104 #define INTERPCOMMAND(name) \
4105                 case DPSOFTRAST_OPCODE_##name : \
4106                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4107                         commandoffset += sizeof( DPSOFTRAST_Command_##name ) + ((COMMAND_SIZE - (sizeof( DPSOFTRAST_Command_##name )&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)); \
4108                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4109                                 commandoffset = 0; \
4110                         break;
4111                 INTERPCOMMAND(Viewport)
4112                 INTERPCOMMAND(ClearColor)
4113                 INTERPCOMMAND(ClearDepth)
4114                 INTERPCOMMAND(ColorMask)
4115                 INTERPCOMMAND(DepthTest)
4116                 INTERPCOMMAND(ScissorTest)
4117                 INTERPCOMMAND(Scissor)
4118                 INTERPCOMMAND(BlendFunc)
4119                 INTERPCOMMAND(BlendSubtract)
4120                 INTERPCOMMAND(DepthMask)
4121                 INTERPCOMMAND(DepthFunc)
4122                 INTERPCOMMAND(DepthRange)
4123                 INTERPCOMMAND(PolygonOffset)
4124                 INTERPCOMMAND(AlphaTest)
4125                 INTERPCOMMAND(AlphaFunc)
4126                 INTERPCOMMAND(SetTexture)
4127                 INTERPCOMMAND(SetShader)
4128                 INTERPCOMMAND(Uniform4f)
4129                 INTERPCOMMAND(UniformMatrix4f)
4130                 INTERPCOMMAND(Uniform1i)
4131
4132                 case DPSOFTRAST_OPCODE_Reset:
4133                         commandoffset = 0;
4134                         break;
4135                 }
4136         }
4137         return commandoffset;
4138 }
4139                                         
4140 int DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread, int commandoffset)
4141 {
4142         int i;
4143         int x;
4144         int startx;
4145         int endx;
4146 //      unsigned int c;
4147 //      unsigned int *colorpixel;
4148         unsigned int *depthpixel;
4149         float w;
4150         float wslope;
4151         int depth;
4152         int depthslope;
4153         unsigned int d;
4154         DPSOFTRAST_State_Triangle *triangle;
4155         DPSOFTRAST_State_Span *span;
4156         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4157         for (i = 0; i < thread->numspans; i++)
4158         {
4159                 span = &thread->spans[i];
4160                 triangle = &dpsoftrast.trianglepool.triangles[span->triangle];
4161                 if (commandoffset != triangle->commandoffset)
4162                 {
4163                         commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4164                         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4165                 }
4166                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4167                 {
4168                         wslope = triangle->w[0];
4169                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4170                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4171                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4172                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4173                         switch(thread->fb_depthfunc)
4174                         {
4175                         default:
4176                         case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4177                         case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4178                         case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4179                         case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4180                         case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4181                         case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4182                         case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4183                         }
4184                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4185                         //for (x = 0;x < span->length;x++)
4186                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4187                         // if there is no color buffer, skip pixel shader
4188                         startx = 0;
4189                         endx = span->length;
4190                         while (startx < endx && !pixelmask[startx])
4191                                 startx++;
4192                         while (endx > startx && !pixelmask[endx-1])
4193                                 endx--;
4194                         if (startx >= endx)
4195                                 continue; // no pixels to fill
4196                         span->pixelmask = pixelmask;
4197                         span->startx = startx;
4198                         span->endx = endx;
4199                         // run pixel shader if appropriate
4200                         // do this before running depthmask code, to allow the pixelshader
4201                         // to clear pixelmask values for alpha testing
4202                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4203                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4204                         if (thread->depthmask)
4205                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4206                                         if (pixelmask[x])
4207                                                 depthpixel[x] = d;
4208                 }
4209                 else
4210                 {
4211                         // no depth testing means we're just dealing with color...
4212                         // if there is no color buffer, skip pixel shader
4213                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4214                         {
4215                                 memset(pixelmask, 1, span->length);
4216                                 span->pixelmask = pixelmask;
4217                                 span->startx = 0;
4218                                 span->endx = span->length;
4219                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4220                         }
4221                 }
4222         }
4223         thread->numspans = 0;
4224         return commandoffset;
4225 }
4226
// Rasterizes pending triangles into spans for one thread's band of scanlines.
//
// Walks the triangle pool ring from thread->triangleoffset up to (but not
// including) freetriangle.  For every triangle that overlaps this thread's
// rows [miny, maxy) it emits DPSOFTRAST_State_Span records into thread->spans,
// handing them to DPSOFTRAST_Draw_ProcessSpans whenever the span buffer reaches
// DPSOFTRAST_DRAW_MAXSPANS.  Afterwards any leftover spans are processed,
// commands up to the last visited triangle's commandoffset are interpreted,
// and the updated commandoffset/triangleoffset are stored back into the thread
// after a MEMORY_BARRIER.
//
// thread       - per-thread state; its index selects the scanline band
// freetriangle - pool index at which to stop (first entry not yet ready)
//
// Without SSE2_PRESENT the body is empty and nothing is consumed.
4227 void DPSOFTRAST_Draw_GenerateSpans(DPSOFTRAST_State_Thread *thread, int freetriangle)
4228 {
4229 #ifdef SSE2_PRESENT
             // this thread owns framebuffer rows miny (inclusive) to maxy (exclusive)
4230         int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4231         int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4232         int commandoffset = thread->commandoffset;
4233         int triangleoffset = thread->triangleoffset;
             // stays NULL if the loop below never runs (nothing pending)
4234         DPSOFTRAST_State_Triangle *triangle = NULL;
4235         int starty;
4236         int endy;
4237         int y;
4238         int numpoints;
4239         __m128 coords[4];
4240         __m128i ycoords;
4241         while (triangleoffset != freetriangle)
4242         {
4243                 triangle = &dpsoftrast.trianglepool.triangles[triangleoffset];
                     // the pool is a ring: wrap the read index
4244                 if (++triangleoffset >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL)
4245                         triangleoffset = 0;
4246                 starty = triangle->starty + 1;
4247                 endy = triangle->endy;
                     // triangle does not touch this thread's band
4248                 if (starty >= maxy || endy <= miny)
4249                         continue;
4250                 numpoints = triangle->numpoints;
4251                 coords[0] = _mm_load_ps(triangle->coords[0]);
4252                 coords[1] = _mm_load_ps(triangle->coords[1]);
4253                 coords[2] = _mm_load_ps(triangle->coords[2]);
4254                 coords[3] = _mm_load_ps(triangle->coords[3]);
4255                 ycoords = _mm_load_si128((const __m128i *)triangle->ycoords);
4256                 if (starty < miny)
4257                         starty = miny;
4258                 if (endy > maxy)
4259                         endy = maxy;
4260                 for (y = starty; y < endy;)
4261                 {
4262                         __m128 xcoords, xslope;
                             // each 32-bit lane is all-ones where y > that vertex's integer y;
                             // movemask turns that into four mask bits per vertex
4263                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), ycoords);
4264                         int yccmask = _mm_movemask_epi8(ycc);
                             // edgeNp/edgeNn: indices of the two vertices forming each active edge
4265                         int edge0p, edge0n, edge1p, edge1n;
4266                         int nexty;
4267                         if (numpoints == 4)
4268                         {
4269                                 switch(yccmask)
4270                                 {
4271                                 default:
4272                                 case 0xFFFF: /*0000*/ y = endy; continue;
4273                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4274                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4275                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4276                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4277                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4278                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4279                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4280                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4281                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4282                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4283                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4284                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4285                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4286                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4287                                 case 0x0000: /*1111*/ y++; continue;
4288                                 }
4289                         }
4290                         else
4291                         {
4292                                 switch(yccmask)
4293                                 {
4294                                 default:
4295                                 case 0xFFFF: /*000*/ y = endy; continue;
4296                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4297                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4298                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4299                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4300                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4301                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4302                                 case 0x0000: /*111*/ y++; continue;
4303                                 }
4304                         }
                             // lanes of already-passed vertices become 0x7FFF, the rest keep their y;
                             // the two shuffle/min steps then reduce to the smallest remaining vertex y,
                             // which is the last row for which the chosen edge pair stays valid
4305                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), ycoords);
4306                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4307                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4308                         nexty = _mm_extract_epi16(ycc, 0);
4309                         if(nexty >= endy) nexty = endy-1;
                             // compare the x (lowest) components: make edge0 the left edge
4310                         if (_mm_ucomigt_ss(_mm_max_ss(coords[edge0n], coords[edge0p]), _mm_min_ss(coords[edge1n], coords[edge1p])))
4311                         {
4312                                 int tmp = edge0n;
4313                                 edge0n = edge1n;
4314                                 edge1n = tmp;
4315                                 tmp = edge0p;
4316                                 edge0p = edge1p;
4317                                 edge1p = tmp;
4318                         }
                             // xslope lanes 0 and 2 hold dx/dy of edge0 and edge1; xcoords holds both
                             // edges' x at row y, biased by 0.5 before the float->int conversion below
4319                         xslope = _mm_sub_ps(_mm_movelh_ps(coords[edge0n], coords[edge1n]), _mm_movelh_ps(coords[edge0p], coords[edge1p]));
4320                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4321                         xcoords = _mm_add_ps(_mm_movelh_ps(coords[edge0p], coords[edge1p]),
4322                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(coords[edge0p], coords[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4323                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4324                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4325                         {
4326                                 int startx, endx, offset;
4327                                 startx = _mm_cvtss_si32(xcoords);
4328                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // clamp the row to the framebuffer width
4329                                 if (startx < 0) startx = 0;
4330                                 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4331                                 if (startx >= endx) continue;
                                     // emit the row as chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
4332                                 for (offset = startx; offset < endx;)
4333                                 {
4334                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4335                                         span->triangle = (int)(triangle - dpsoftrast.trianglepool.triangles);
4336                                         span->x = offset;
4337                                         span->y = y;
4338                                         span->length = endx - offset;
4339                                         if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4340                                                 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4341                                         offset += span->length;
                                             // span buffer full: shade what we have before continuing
4342                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4343                                                 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
4344                                 }
4345                         }
4346                 }
4347         }
4348
4349         if (thread->numspans > 0)
4350                 commandoffset = DPSOFTRAST_Draw_ProcessSpans(thread, commandoffset);
             // triangle is still NULL when nothing was pending on entry
             // (triangleoffset == freetriangle); there is then no command position
             // to catch up to, so skip this instead of dereferencing NULL
4351         if (triangle != NULL && commandoffset != triangle->commandoffset)
4352         {
4353                 commandoffset = DPSOFTRAST_Draw_InterpretCommands(thread, commandoffset, triangle->commandoffset);
4354                 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4355         }
4356         
             // make all prior stores visible before publishing the new read positions
4357         MEMORY_BARRIER;
4358
4359         thread->commandoffset = commandoffset;
4360         thread->triangleoffset = triangleoffset;
4361 #endif
4362 }
4363
// Publishes every queued triangle and blocks until all threads have consumed
// them, then marks the triangle and command pools as empty.
//
// With USE_THREADS the caller waits on each worker's waitcond (under
// trianglemutex) until that worker's triangleoffset equals drawtriangle.
// Without USE_THREADS the pending triangles are rasterized right here by
// calling DPSOFTRAST_Draw_GenerateSpans for each thread state.
4364 void DPSOFTRAST_Draw_FlushThreads(void)
4365 {
4366         DPSOFTRAST_State_Thread *thread;
4367         int i;
             // expose any triangles written since the last publish; the barrier
             // orders the triangle data stores before the index store
4368         if(dpsoftrast.drawtriangle != dpsoftrast.trianglepool.freetriangle)
4369         {
4370                 MEMORY_BARRIER;
4371                 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4372         }
4373 #ifdef USE_THREADS
4374         SDL_LockMutex(dpsoftrast.trianglemutex);
4375 #endif
4376         for (i = 0; i < dpsoftrast.numthreads; i++)
4377         {
4378                 thread = &dpsoftrast.threads[i];
4379 #ifdef USE_THREADS
                     // loop (not if): re-check after every wakeup until the worker has caught up
4380                 while (thread->triangleoffset != dpsoftrast.drawtriangle)
4381                 {
                             // tell the worker we want a signal on its waitcond, wake any
                             // sleeping workers, then sleep (SDL_CondWait releases the mutex)
4382                         thread->waiting = true;
4383                         SDL_CondBroadcast(dpsoftrast.trianglecond);
4384                         SDL_CondWait(thread->waitcond, dpsoftrast.trianglemutex);
4385                         thread->waiting = false;
4386                 }
4387 #else
                     // no worker threads: do the rasterization synchronously
4388                 if (thread->triangleoffset != dpsoftrast.drawtriangle) 
4389                         DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle);
4390 #endif
4391         }
4392 #ifdef USE_THREADS
4393         SDL_UnlockMutex(dpsoftrast.trianglemutex);
4394 #endif
             // everything has been consumed, so both pools count as empty again
4395         dpsoftrast.trianglepool.usedtriangles = 0;
4396         dpsoftrast.commandpool.usedcommands = 0;
4397 }
4398
4399 #ifdef USE_THREADS
// Worker thread entry point (only built with USE_THREADS).
//
// data is the DPSOFTRAST_State_Thread owned by this worker.  The loop runs
// while thread->index >= 0: whenever the worker is behind drawtriangle it
// rasterizes the pending triangles, otherwise it sleeps on trianglecond.
// Always returns 0.
4400 static int DPSOFTRAST_Draw_Thread(void *data)
4401 {
4402         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4403         while(thread->index >= 0)
4404         {
                     // unlocked fast-path check: work is available, go process it
4405                 if (thread->triangleoffset != dpsoftrast.drawtriangle)
4406                 {
4407                         DPSOFTRAST_Draw_GenerateSpans(thread, dpsoftrast.drawtriangle); 
4408                 }
4409                 else 
4410                 {
4411                         SDL_LockMutex(dpsoftrast.trianglemutex);
                             // re-check under the mutex: new triangles may have been
                             // published between the test above and taking the lock
4412                         if (thread->triangleoffset != dpsoftrast.drawtriangle)
4413                         {
4414                                 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4415                                 continue;
4416                         }
                             // caught up: release a flusher blocked on our waitcond
                             // (DPSOFTRAST_Draw_FlushThreads sets thread->waiting), then sleep
                             // until trianglecond is signalled
4417                         if (thread->waiting) SDL_CondSignal(thread->waitcond);
4418                         SDL_CondWait(dpsoftrast.trianglecond, dpsoftrast.trianglemutex);
4419                         SDL_UnlockMutex(dpsoftrast.trianglemutex);
4420                 }
4421         }   
4422         return 0;
4423 }
4424 #endif
4425
// Sets up triangles for rasterization and queues them in dpsoftrast.trianglepool.
//
// For each of the numtriangles triangles this:
//  - resolves three vertex indices (element3i if given, else element3s, else
//    sequential i*3+0..2); firstvertex is subtracted from indexed elements,
//  - clips against the near plane using clipdist = z + w of the transformed
//    position, producing 3 or 4 screen-space points and a clipcase (0-6),
//  - drops the triangle if dpsoftrast.cullface rejects its facing, or if its
//    clamped screen bounds are empty,
//  - stores screen coords, integer y coords, plane equations (x slope,
//    y slope, origin) for w and for every array j < numarrays with
//    arraymask[j] set, and a mip level per texture unit.
// After every DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES queued triangles, and once
// more at the end, drawtriangle is advanced; workers are then woken
// (USE_THREADS) or the work is done inline via DPSOFTRAST_Draw_FlushThreads.
//
// Without SSE2_PRESENT the body is empty.
4426 void DPSOFTRAST_Draw_ProcessTriangles(int firstvertex, int numtriangles, const int *element3i, const unsigned short *element3s, unsigned char *arraymask, int numarrays)
4427 {
4428 #ifdef SSE2_PRESENT
4429         int cullface = dpsoftrast.cullface;
4430         int width = dpsoftrast.fb_width;
4431         int height = dpsoftrast.fb_height;
             // per-lane maximum valid coordinate: (width-1, height-1) repeated
4432         __m128i fbmax = _mm_sub_epi16(_mm_setr_epi16(width, height, width, height, width, height, width, height), _mm_set1_epi16(1));
4433         DPSOFTRAST_State_Triangle *triangle;
             // triangles queued since drawtriangle was last advanced
4434         int numqueued = 0;
4435         int i;
4436         int j;
4437         int k;
4438         int y;
4439         int e[3];
4440         __m128i screeny;
4441         int starty, endy;
4442         int numpoints;
4443         int clipcase;
4444         float clipdist[4];
4445         __m128 triangleedge1, triangleedge2, trianglenormal;
4446         __m128 clipfrac[3];
4447         __m128 screen[4];
4448         DPSOFTRAST_Texture *texture;
4449         screen[3] = _mm_setzero_ps();
4450         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps(); 
4451         for (i = 0;i < numtriangles;i++)
4452         {
4453                 // generate the 3 edges of this triangle
4454                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4455                 if (element3i)
4456                 {
4457                         e[0] = element3i[i*3+0] - firstvertex;
4458                         e[1] = element3i[i*3+1] - firstvertex;
4459                         e[2] = element3i[i*3+2] - firstvertex;
4460                 }
4461                 else if (element3s)
4462                 {
4463                         e[0] = element3s[i*3+0] - firstvertex;
4464                         e[1] = element3s[i*3+1] - firstvertex;
4465                         e[2] = element3s[i*3+2] - firstvertex;
4466                 }
4467                 else
4468                 {
4469                         e[0] = i*3+0;
4470                         e[1] = i*3+1;
4471                         e[2] = i*3+2;
4472                 }
4473
                     // SKIPBACKFACE: computes the two screen-space edges and the scalar
                     // edge1.x*edge2.y - edge1.y*edge2.x, then 'continue's the triangle
                     // loop when cullface is GL_BACK and that value is negative, or
                     // GL_FRONT and it is positive.  Note the hidden 'continue'.
4474 #define SKIPBACKFACE \
4475                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4476                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4477                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4478                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4479                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4480                 switch(cullface) \
4481                 { \
4482                 case GL_BACK: \
4483                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4484                                 continue; \
4485                         break; \
4486                 case GL_FRONT: \
4487                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4488                                 continue; \
4489                         break; \
4490                 }
4491                         //trianglenormal = _mm_sub_ps(_mm_mul_ps(triangleedge[0], _mm_shuffle_ps(triangleedge[1], triangleedge[1], _MM_SHUFFLE(3, 0, 2, 1))),
4492                         //                                                _mm_mul_ps(_mm_shuffle_ps(triangleedge[0], triangleedge[0], _MM_SHUFFLE(3, 0, 2, 1)), triangleedge[1]));
4493                         //trianglenormal[2] = triangleedge[0][0] * triangleedge[1][1] - triangleedge[0][1] * triangleedge[1][0];
4494                         //trianglenormal[0] = triangleedge[0][1] * triangleedge[1][2] - triangleedge[0][2] * triangleedge[1][1];
4495                         //trianglenormal[1] = triangleedge[0][2] * triangleedge[1][0] - triangleedge[0][0] * triangleedge[1][2];
4496
4497                         // macros for clipping vertices
4498
                             // CLIPPEDVERTEXLERP(k,p1,p2): screen[k] = projection of the point on
                             // edge p1->p2 where clipdist reaches zero; the fraction is kept in
                             // clipfrac[p1] so GENATTRIBLERP can reuse it for the attributes
4499 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4500                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4501                         { \
4502                                 __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[p2]*4]); \
4503                                 screen[k] = DPSOFTRAST_Draw_ProjectVertex(_mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1]))); \
4504                         }
                             // CLIPPEDVERTEXCOPY(k,p1): screen[k] = stored screen coord of vertex p1
4505 #define CLIPPEDVERTEXCOPY(k,p1) \
4506                         screen[k] = _mm_load_ps(&dpsoftrast.screencoord4f[e[p1]*4]);
4507
                     // GENATTRIBS mirrors, per clipcase, how screen[0..2] were built above:
                     // a copied vertex copies the attribute, a lerped vertex lerps it with
                     // the same clipfrac
4508 #define GENATTRIBCOPY(j, attrib, p1) \
4509                 attrib = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]);
4510 #define GENATTRIBLERP(j, attrib, p1, p2) \
4511                 { \
4512                         __m128 v1 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p1]*4]), v2 = _mm_load_ps(&dpsoftrast.post_array4f[j][e[p2]*4]); \
4513                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4514                 }
4515 #define GENATTRIBS(j, attrib0, attrib1, attrib2) \
4516                 switch(clipcase) \
4517                 { \
4518                 default: \
4519                 case 0: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4520                 case 1: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4521                 case 2: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4522                 case 3: GENATTRIBCOPY(j, attrib0, 0); GENATTRIBLERP(j, attrib1, 0, 1); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4523                 case 4: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBCOPY(j, attrib2, 2); break; \
4524                 case 5: GENATTRIBLERP(j, attrib0, 0, 1); GENATTRIBCOPY(j, attrib1, 1); GENATTRIBLERP(j, attrib2, 1, 2); break; \
4525                 case 6: GENATTRIBLERP(j, attrib0, 1, 2); GENATTRIBCOPY(j, attrib1, 2); GENATTRIBLERP(j, attrib2, 2, 0); break; \
4526                 }
4527
4528                 // calculate distance from nearplane
4529                 clipdist[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[0]*4+3];
4530                 clipdist[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[1]*4+3];
4531                 clipdist[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+2] + dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][e[2]*4+3];
                     // classify by which vertices have clipdist >= 0; one vertex behind the
                     // plane yields a quad (numpoints 4), two behind yield a triangle
4532                 if (clipdist[0] >= 0.0f)
4533                 {
4534                         if (clipdist[1] >= 0.0f)
4535                         {
4536                                 if (clipdist[2] >= 0.0f)
4537                                 {
4538                                         // triangle is entirely in front of nearplane
4539                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4540                                         SKIPBACKFACE;
4541                                         numpoints = 3;
4542                                         clipcase = 0;
4543                                 }
4544                                 else
4545                                 {
4546                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4547                                         SKIPBACKFACE;
4548                                         numpoints = 4;
4549                                         clipcase = 1;
4550                                 }
4551                         }
4552                         else 
4553                         {
4554                                 if (clipdist[2] >= 0.0f)
4555                                 {
4556                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2);     CLIPPEDVERTEXCOPY(3,2);
4557                                         SKIPBACKFACE;
4558                                         numpoints = 4;
4559                                         clipcase = 2;
4560                                 }
4561                                 else
4562                                 {
4563                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4564                                         SKIPBACKFACE;
4565                                         numpoints = 3;
4566                                         clipcase = 3;
4567                                 }
4568                         }
4569                 }                       
4570                 else if (clipdist[1] >= 0.0f)
4571                 {
4572                         if (clipdist[2] >= 0.0f)
4573                         {
4574                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4575                                 SKIPBACKFACE;
4576                                 numpoints = 4;
4577                                 clipcase = 4;
4578                         }
4579                         else
4580                         {
4581                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4582                                 SKIPBACKFACE;
4583                                 numpoints = 3;
4584                                 clipcase = 5;
4585                         }
4586                 }
4587                 else if (clipdist[2] >= 0.0f)
4588                 {
4589                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4590                         SKIPBACKFACE;
4591                         numpoints = 3;
4592                         clipcase = 6;
4593                 }
4594                 else continue; // triangle is entirely behind nearplane
4595
4596                 {
4597                         // calculate integer y coords for triangle points
                             // screeni packs (x,y) of the four points as 16-bit lanes; for a
                             // 3-point triangle point 2 is duplicated into the fourth slot
4598                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4599                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)), 
4600                                         screenmin = _mm_min_epi16(screeni, screenir), 
4601                                         screenmax = _mm_max_epi16(screeni, screenir);
                             // reduce to the bounding box (lane 0 = x, lane 1 = y) and clamp it
                             // to [0, fbmax]
4602                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4603                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4604                         screenmin = _mm_max_epi16(screenmin, _mm_setzero_si128());
4605                         screenmax = _mm_min_epi16(screenmax, fbmax);
4606                         // skip offscreen triangles
4607                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4608                                 continue;
4609                         starty = _mm_extract_epi16(screenmin, 1);
4610                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // arithmetic shift leaves each point's y as a 32-bit lane
4611                         screeny = _mm_srai_epi32(screeni, 16);
4612                 }
4613
                     // pool (nearly) full: make room before writing the next entry
                     // NOTE(review): DPSOFTRAST_Draw_FreeTrianglePool is not visible here;
                     // presumably it waits for workers to release that many entries - confirm
4614                 if (dpsoftrast.trianglepool.usedtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1)
4615 #ifdef USE_THREADS
4616                         DPSOFTRAST_Draw_FreeTrianglePool(DPSOFTRAST_DRAW_MAXTRIANGLEPOOL/8);
4617 #else
4618                         DPSOFTRAST_Draw_FlushThreads();
4619 #endif
4620
4621                 triangle = &dpsoftrast.trianglepool.triangles[dpsoftrast.trianglepool.freetriangle];
4622                 triangle->commandoffset = dpsoftrast.commandpool.freecommand;
4623                 triangle->starty = starty;
4624                 triangle->endy = endy;
4625                 triangle->numpoints = numpoints;
4626                 _mm_store_ps(triangle->coords[0], screen[0]);
4627                 _mm_store_ps(triangle->coords[1], screen[1]);
4628                 _mm_store_ps(triangle->coords[2], screen[2]);
4629                 _mm_store_ps(triangle->coords[3], numpoints > 3 ? screen[3] : screen[2]);
4630                 _mm_store_si128((__m128i *)triangle->ycoords, screeny);
4631
4632                 // calculate attribute plans for triangle data...
4633                 // okay, this triangle is going to produce spans, we'd better project
4634                 // the interpolants now (this is what gives perspective texturing),
4635                 // this consists of simply multiplying all arrays by the W coord
4636                 // (which is basically 1/Z), which will be undone per-pixel
4637                 // (multiplying by Z again) to get the perspective-correct array
4638                 // values
4639                 {
4640                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
                             // edge components divided by the scalar computed in SKIPBACKFACE
4641                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4642                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4643                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4644                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4645                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4646                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4647                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4648                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane equation for w itself, relative to point 1
4649                         attribedge1 = _mm_sub_ss(w0, w1);
4650                         attribedge2 = _mm_sub_ss(w2, w1);
4651                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4652                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4653                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4654                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4655                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4656                         _mm_store_ss(&triangle->w[0], attribxslope);
4657                         _mm_store_ss(&triangle->w[1], attribyslope);
4658                         _mm_store_ss(&triangle->w[2], attriborigin);
                             // same plane setup for each enabled array, with attributes
                             // pre-multiplied by w
4659                         for (j = 0;j < numarrays;j++)
4660                         {
4661                                 if (arraymask[j])
4662                                 {
4663                                         __m128 attrib0, attrib1, attrib2;
4664                                         GENATTRIBS(j, attrib0, attrib1, attrib2);
4665                                         attriborigin = _mm_mul_ps(attrib1, w1);
4666                                         attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4667                                         attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4668                                         attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4669                                         attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4670                                         attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4671                                         _mm_stream_ps(triangle->attribs[j][0], attribxslope);
4672                                         _mm_stream_ps(triangle->attribs[j][1], attribyslope);
4673                                         _mm_stream_ps(triangle->attribs[j][2], attriborigin);
4674                                 }
4675                         }
4676                 }
4677
4678                 // adjust texture LOD by texture density, in the simplest way possible...
4679                 {
4680                         __m128 mipedgescale, mipedgetc, mipdensity, attrib0, attrib1, attrib2;
                             // default every unit to mip 0
4681                         memset(triangle->mip, 0, sizeof(triangle->mip));
                             // approximate reciprocal screen-space length of each edge
4682                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4683                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4684                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
                             // attribute deltas along both edges of the array the shader mode
                             // names as its LOD source, scaled by the reciprocal edge lengths
4685                         k = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].lodarrayindex;
4686                         GENATTRIBS(k, attrib0, attrib1, attrib2);
4687                         mipedgetc = _mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1));
4688                         mipedgetc = _mm_mul_ps(mipedgetc, mipedgescale);
4689                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4690                         {
4691                                 int texunit = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].texunits[j];
                                     // an out-of-range unit index ends the shader mode's unit list
4692                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4693                                         break;
4694                                 texture = dpsoftrast.texbound[texunit];
4695                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4696                                 {
4697                                         mipdensity = _mm_mul_ps(mipedgetc, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4698                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4699                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
                                             // keep the smaller of the two edges' squared densities
4700                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4701                                         // this will be multiplied in the texturing routine by the texture resolution
4702                                         y = _mm_cvtss_si32(mipdensity);
4703                                         if (y > 0)
4704                                         {
                                                     // half of log2 of the squared density, clamped to the
                                                     // last available mip level
4705                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4706                                                 if (y > texture->mipmaps - 1)
4707                                                         y = texture->mipmaps - 1;
4708                                                 triangle->mip[texunit] = y;
4709                                         }
4710                                 }
4711                         }
4712                 }
4713
                     // advance the ring write index
4714                 dpsoftrast.trianglepool.freetriangle = dpsoftrast.trianglepool.freetriangle < DPSOFTRAST_DRAW_MAXTRIANGLEPOOL-1 ? dpsoftrast.trianglepool.freetriangle + 1 : 0;
4715                 dpsoftrast.trianglepool.usedtriangles++;
4716
4717                 numqueued++;
                     // publish in batches so the consumers can start before the whole call finishes
4718                 if (numqueued >= DPSOFTRAST_DRAW_FLUSHPROCESSTRIANGLES)
4719                 {
4720                         MEMORY_BARRIER;
4721                         dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4722
4723 #ifdef USE_THREADS
4724                         //SDL_LockMutex(dpsoftrast.trianglemutex);
4725                         SDL_CondBroadcast(dpsoftrast.trianglecond);
4726                         //SDL_UnlockMutex(dpsoftrast.trianglemutex);
4727 #else
4728                         DPSOFTRAST_Draw_FlushThreads();
4729 #endif
4730                         numqueued = 0;
4731                 }
4732         }
             // publish whatever is left over from the last partial batch
4733         if (numqueued > 0)
4734         {
4735                 MEMORY_BARRIER;
4736                 dpsoftrast.drawtriangle = dpsoftrast.trianglepool.freetriangle;
4737
4738 #ifdef USE_THREADS
4739                 //SDL_LockMutex(dpsoftrast.trianglemutex);
4740                 SDL_CondBroadcast(dpsoftrast.trianglecond);
4741                 //SDL_UnlockMutex(dpsoftrast.trianglemutex);
4742 #else
4743                 DPSOFTRAST_Draw_FlushThreads();
4744 #endif
4745         }
4746 #endif
4747 }
4748
4749 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4750 {
4751         int i;
4752         int lastarray = DPSOFTRAST_ARRAY_POSITION;
4753         unsigned char arraymask[DPSOFTRAST_ARRAY_TOTAL];
4754         memset(arraymask, false, sizeof(arraymask));
4755         arraymask[DPSOFTRAST_ARRAY_POSITION] = true;
4756         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4757         {
4758                 int arrayindex = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4759                 if (arrayindex >= DPSOFTRAST_ARRAY_TOTAL)
4760                         break;
4761                 switch (arrayindex)
4762                 {
4763                         case DPSOFTRAST_ARRAY_POSITION:
4764                         case DPSOFTRAST_ARRAY_COLOR: 
4765                                 break;
4766                         default:
4767                                 if (dpsoftrast.pointer_texcoordf[arrayindex-DPSOFTRAST_ARRAY_TEXCOORD0] == NULL)
4768                                         continue;
4769                                 break;
4770                 }
4771                 arraymask[arrayindex] = true;
4772                 if (arrayindex > lastarray)
4773                         lastarray = arrayindex;
4774         }
4775         DPSOFTRAST_Draw_LoadVertices(firstvertex, numvertices, arraymask[DPSOFTRAST_ARRAY_COLOR]);
4776         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4777 //      DPSOFTRAST_Draw_ProjectVertices(dpsoftrast.screencoord4f, dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION], numvertices);
4778         DPSOFTRAST_Draw_ProcessTriangles(firstvertex, numtriangles, element3i, element3s, arraymask, lastarray+1);
4779 }
4780
// Pushes all pending work through the rasterizer: first
// DPSOFTRAST_Draw_SyncCommands, then DPSOFTRAST_Draw_FlushThreads.
// NOTE(review): presumably this does not return until the worker threads have
// caught up - confirm against DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_SyncCommands();

	DPSOFTRAST_Draw_FlushThreads();
}
4786
// Finish entry point; in this implementation it does nothing beyond what
// DPSOFTRAST_Flush does.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
4791
4792 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4793 {
4794         int i;
4795         union
4796         {
4797                 int i;
4798                 unsigned char b[4];
4799         }
4800         u;
4801         u.i = 1;
4802         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4803         dpsoftrast.bigendian = u.b[3];
4804         dpsoftrast.fb_width = width;
4805         dpsoftrast.fb_height = height;
4806         dpsoftrast.fb_depthpixels = depthpixels;
4807         dpsoftrast.fb_colorpixels[0] = colorpixels;
4808         dpsoftrast.fb_colorpixels[1] = NULL;
4809         dpsoftrast.fb_colorpixels[1] = NULL;
4810         dpsoftrast.fb_colorpixels[1] = NULL;
4811         dpsoftrast.texture_firstfree = 1;
4812         dpsoftrast.texture_end = 1;
4813         dpsoftrast.texture_max = 0;
4814         dpsoftrast.viewport[0] = 0;
4815         dpsoftrast.viewport[1] = 0;
4816         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4817         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4818         dpsoftrast.color[0] = 1;
4819         dpsoftrast.color[1] = 1;
4820         dpsoftrast.color[2] = 1;
4821         dpsoftrast.color[3] = 1;
4822         dpsoftrast.cullface = GL_BACK;
4823 #ifdef USE_THREADS
4824         dpsoftrast.numthreads = bound(1, numthreads, 64);
4825         dpsoftrast.trianglemutex = SDL_CreateMutex();
4826         dpsoftrast.trianglecond = SDL_CreateCond();
4827 #else
4828         dpsoftrast.numthreads = 1;
4829 #endif
4830         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4831         for (i = 0; i < dpsoftrast.numthreads; i++)
4832         {
4833                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4834                 thread->index = i;
4835                 thread->colormask[1] = 1;
4836                 thread->colormask[2] = 1;
4837                 thread->colormask[3] = 1;
4838                 thread->blendfunc[0] = GL_ONE;
4839                 thread->blendfunc[1] = GL_ZERO;
4840                 thread->depthmask = true;
4841                 thread->depthtest = true;
4842                 thread->depthfunc = GL_LEQUAL;
4843                 thread->scissortest = false;
4844                 thread->alphatest = false;
4845                 thread->alphafunc = GL_GREATER;
4846                 thread->alphavalue = 0.5f;
4847                 thread->scissor[0] = 0;
4848                 thread->scissor[1] = 0;
4849                 thread->scissor[2] = dpsoftrast.fb_width;
4850                 thread->scissor[3] = dpsoftrast.fb_height;
4851                 thread->depthrange[0] = 0;
4852                 thread->depthrange[1] = 1;
4853                 thread->polygonoffset[0] = 0;
4854                 thread->polygonoffset[1] = 0;
4855
4856                 thread->numspans = 0;
4857                 thread->triangleoffset = 0;
4858                 thread->commandoffset = 0;
4859                 thread->waiting = false;
4860 #ifdef USE_THREADS
4861                 thread->waitcond = SDL_CreateCond();
4862 #endif
4863
4864                 thread->validate = -1;
4865                 DPSOFTRAST_Validate(thread, -1);
4866 #ifdef USE_THREADS
4867                 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4868 #endif
4869         }
4870 }
4871
4872 void DPSOFTRAST_Shutdown(void)
4873 {
4874         int i;
4875 #ifdef USE_THREADS
4876         if(dpsoftrast.numthreads > 0)
4877         {
4878                 DPSOFTRAST_State_Thread *thread;
4879                 SDL_LockMutex(dpsoftrast.trianglemutex);
4880                 for (i = 0; i < dpsoftrast.numthreads; i++)
4881                 {
4882                         thread = &dpsoftrast.threads[i];
4883                         thread->index = -1;
4884                 }
4885                 SDL_CondBroadcast(dpsoftrast.trianglecond);
4886                 SDL_UnlockMutex(dpsoftrast.trianglemutex);
4887                 for (i = 0; i < dpsoftrast.numthreads; i++)
4888                 {
4889                         thread = &dpsoftrast.threads[i];
4890                         SDL_WaitThread(thread->thread, NULL);
4891                         SDL_DestroyCond(thread->waitcond);
4892                 }
4893                 SDL_DestroyMutex(dpsoftrast.trianglemutex);
4894                 SDL_DestroyCond(dpsoftrast.trianglecond);
4895         }
4896 #endif
4897         for (i = 0;i < dpsoftrast.texture_end;i++)
4898                 if (dpsoftrast.texture[i].bytes)
4899                         MM_FREE(dpsoftrast.texture[i].bytes);
4900         if (dpsoftrast.texture)
4901                 free(dpsoftrast.texture);
4902         if (dpsoftrast.threads)
4903                 MM_FREE(dpsoftrast.threads);
4904         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4905 }
4906