]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
fix some redundant shuffles
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// When compiled as C, alias the project's qboolean type as bool so this file can use bool.
// (qboolean is not declared in this file; presumably it comes from quakedef.h - confirm.)
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// COMMAND_SIZE below is defined as ALIGN_SIZE.
13 #define ALIGN_SIZE 16
// alignment (in bytes) handed to _mm_malloc by MM_MALLOC / MM_CALLOC
14 #define ATOMIC_SIZE 32
15
// With SSE2 available, select compiler/platform specific definitions of:
//   ALIGN(var)             - declare var with 16 byte alignment
//   ATOMIC(var)            - declare var with 32 byte alignment
//   MEMORY_BARRIER         - _mm_sfence() on every branch (the alternatives are left commented out)
//   ATOMIC_COUNTER         - integer type used for shared counters
//   ATOMIC_INCREMENT/DECREMENT/ADD - atomic read-modify-write helpers on such a counter
// Compilers matching none of the branches get the plain fallbacks defined after this block.
16 #ifdef SSE2_PRESENT
17         #if defined(__APPLE__)
18                 #include <libkern/OSAtomic.h>
19                 #define ALIGN(var) var __attribute__((__aligned__(16)))
20                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21                 #define MEMORY_BARRIER (_mm_sfence())
22                 #define ATOMIC_COUNTER volatile int32_t 
23                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
26         #elif defined(__GNUC__)
27                 #define ALIGN(var) var __attribute__((__aligned__(16)))
28                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29                 #define MEMORY_BARRIER (_mm_sfence())
30                 //(__sync_synchronize())
31                 #define ATOMIC_COUNTER volatile int
32                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
35         #elif defined(_MSC_VER)
36                 #define ALIGN(var) __declspec(align(16)) var
37                 #define ATOMIC(var) __declspec(align(32)) var
38                 #define MEMORY_BARRIER (_mm_sfence())
39                 //(MemoryBarrier())
40                 #define ATOMIC_COUNTER volatile LONG
41                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
44         #endif
45 #endif
46
// Fallbacks for anything the platform block above did not define:
// no alignment attributes, no memory barrier, and plain (non-atomic) integer arithmetic.
47 #ifndef ALIGN
48 #define ALIGN(var) var
49 #endif
50 #ifndef ATOMIC
51 #define ATOMIC(var) var
52 #endif
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
55 #endif
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
58 #endif
// like the atomic versions, increment/decrement evaluate to the updated value
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
61 #endif
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
64 #endif
65 #ifndef ATOMIC_ADD
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
67 #endif
68
// Memory allocation wrappers.
// With SSE2 the allocations are aligned to ATOMIC_SIZE bytes via _mm_malloc and must be
// released with MM_FREE (_mm_free); otherwise the wrappers map to malloc/calloc/free.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned, zero-filled counterpart of calloc().
// nmemb: number of elements, size: bytes per element.
// Returns NULL when nmemb*size does not fit in size_t or when the allocation fails.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// refuse requests whose byte count would wrap around, as calloc() does,
	// instead of silently allocating a too-small buffer
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
87
// Identifies one vertex attribute array. The values are used as array indices:
// DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f are both
// dimensioned by DPSOFTRAST_ARRAY_TOTAL, so the order of the enumerators matters.
88 typedef enum DPSOFTRAST_ARRAY_e
89 {
90         DPSOFTRAST_ARRAY_POSITION,
91         DPSOFTRAST_ARRAY_COLOR,
92         DPSOFTRAST_ARRAY_TEXCOORD0,
93         DPSOFTRAST_ARRAY_TEXCOORD1,
94         DPSOFTRAST_ARRAY_TEXCOORD2,
95         DPSOFTRAST_ARRAY_TEXCOORD3,
96         DPSOFTRAST_ARRAY_TEXCOORD4,
97         DPSOFTRAST_ARRAY_TEXCOORD5,
98         DPSOFTRAST_ARRAY_TEXCOORD6,
99         DPSOFTRAST_ARRAY_TEXCOORD7,
        // number of arrays, not an array itself
100         DPSOFTRAST_ARRAY_TOTAL
101 }
102 DPSOFTRAST_ARRAY;
103
// One slot of the dpsoftrast.texture array. A slot whose bytes pointer is NULL is
// treated as unused (see DPSOFTRAST_Texture_GetByIndex / DPSOFTRAST_Texture_New).
104 typedef struct DPSOFTRAST_Texture_s
105 {
        // flags, width, height, depth: copied from the DPSOFTRAST_Texture_New arguments
106         int flags;
107         int width;
108         int height;
109         int depth;
        // 6 when DPSOFTRAST_TEXTURE_FLAG_CUBEMAP is set, otherwise 1
110         int sides;
        // set by DPSOFTRAST_Texture_Filter
111         DPSOFTRAST_TEXTURE_FILTER filter;
        // number of valid entries in mipmap[]
112         int mipmaps;
        // total size of the bytes buffer (sum of all mip level sizes)
113         int size;
        // when nonzero, the texture functions call DPSOFTRAST_Flush() before touching the
        // texture (where this counter is changed is not visible in this part of the file)
114         ATOMIC_COUNTER binds;
        // pixel storage for all mip levels, allocated with MM_CALLOC
115         unsigned char *bytes;
        // per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
116         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
117 }
118 DPSOFTRAST_Texture;
119
// Command records are aligned with ALIGN(), i.e. the same alignment as other ALIGN() data.
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
122
// Common header of every command record; the structs generated by DEFCOMMAND start
// with the same two fields.
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
124 {
125         unsigned char opcode;
        // presumably the size of the whole record in bytes - confirm against the code that fills it in
126         unsigned short commandsize;
127 }
128 DPSOFTRAST_Command);
129
// opcode 0 is declared by hand, without a matching command struct
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
131
// DEFCOMMAND(opcodeval, name, fields) declares
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval, and
//   - the aligned struct type DPSOFTRAST_Command_<name>, consisting of the common
//     opcode/commandsize header followed by the given fields.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
135         { \
136                 unsigned char opcode; \
137                 unsigned short commandsize; \
138                 fields \
139         } DPSOFTRAST_Command_##name );
140
// size in bytes of the command buffer inside DPSOFTRAST_State_Command_Pool
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size a single command may have; its use is not visible here
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
143
// Fixed-size storage for command records.
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
145 {
        // NOTE(review): freecommand/usedcommands look like bookkeeping for the commands
        // buffer (next free position / amount in use); the code maintaining them is not
        // in this part of the file - confirm
146         int freecommand;
147         int usedcommands;
148         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
149 }
150 DPSOFTRAST_State_Command_Pool);
151
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros.
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
153 {
154         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
155         float w[3];
        // for each array: [0] = change per unit of span x, [1] = change per unit of span y,
        // [2] = base value (this is how DPSOFTRAST_CALCATTRIB combines them); 4 floats each
156         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
157 }
158 DPSOFTRAST_State_Triangle);
159
// SSE version: evaluates attribute array arrayindex of triangle at the start of span.
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data must be __m128 lvalues; attribs is read with aligned loads (_mm_load_ps).
160 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
161         slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
162         data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
163                                         _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
164                                                                 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
165 }
// Scalar version of DPSOFTRAST_CALCATTRIB: evaluates attribute array arrayindex of
// triangle at the start of span, one float component at a time.
//   slope[k] = attribs[arrayindex][0][k]
//   data[k]  = attribs[arrayindex][2][k] + span->x * slope[k] + span->y * attribs[arrayindex][1][k]
// data and slope must be writable arrays of at least 4 floats. Every macro argument is
// parenthesized so that pointer expressions such as "spans + i" can be passed safely;
// the arguments are evaluated more than once, so they must not have side effects.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
176                                         
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
178
// One horizontal run of pixels belonging to a triangle.
// x and y are also the coordinates the DPSOFTRAST_CALCATTRIB macros evaluate attributes at.
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
180 {
181         int triangle; // triangle this span was generated by
182         int x; // framebuffer x coord
183         int y; // framebuffer y coord
184         int startx; // usable range (according to pixelmask)
185         int endx; // usable range (according to pixelmask)
186         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
187 }
188 DPSOFTRAST_State_Span);
189
// capacities of the spans[] and triangles[] arrays in DPSOFTRAST_State_Thread
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
192
// Bits of DPSOFTRAST_State_Thread::validate. A set bit means the matching derived
// values are stale; DPSOFTRAST_Validate recomputes them and clears the bit.
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all of the above
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
197
// Blend modes stored in DPSOFTRAST_State_Thread::fb_blendmode.
// DPSOFTRAST_RecalcBlendFunc selects one from the blendfunc factor pair and the
// blendsubtract flag; factor pairs it does not recognize map to DPSOFTRAST_BLENDMODE_OPAQUE.
198 typedef enum DPSOFTRAST_BLENDMODE_e
199 {
200         DPSOFTRAST_BLENDMODE_OPAQUE,
201         DPSOFTRAST_BLENDMODE_ALPHA,
202         DPSOFTRAST_BLENDMODE_ADDALPHA,
203         DPSOFTRAST_BLENDMODE_ADD,
204         DPSOFTRAST_BLENDMODE_INVMOD,
205         DPSOFTRAST_BLENDMODE_MUL,
206         DPSOFTRAST_BLENDMODE_MUL2,
207         DPSOFTRAST_BLENDMODE_SUBALPHA,
208         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209         DPSOFTRAST_BLENDMODE_INVADD,
        // number of blend modes, not a mode itself
210         DPSOFTRAST_BLENDMODE_TOTAL
211 }
212 DPSOFTRAST_BLENDMODE;
213
// State held separately for each entry of dpsoftrast.threads: a copy of the render
// state, the values derived from it, and per-thread span/triangle work buffers.
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
215 {
216         void *thread;
217         int index;
218         
        // render state; the Recalc* functions read these to produce the fb_* values below
219         int cullface;
220         int colormask[4];
        // source and destination blend factors, in that order (see DPSOFTRAST_RecalcBlendFunc)
221         int blendfunc[2];
222         int blendsubtract;
223         int depthmask;
224         int depthtest;
225         int depthfunc;
226         int scissortest;
227         int alphatest;
228         int alphafunc;
229         float alphavalue;
        // viewport and scissor are x, y, width, height with y counted from the bottom of
        // the framebuffer (DPSOFTRAST_RecalcFB / DPSOFTRAST_RecalcViewport flip them)
230         int viewport[4];
231         int scissor[4];
232         float depthrange[2];
233         float polygonoffset[2];
234
235         int shader_mode;
236         int shader_permutation;
237
        // pointers into dpsoftrast.texture; DPSOFTRAST_Texture_Grow rebases them when that array moves
238         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
239         
240         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
241         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
242
243         // DPSOFTRAST_VALIDATE_ flags
244         int validate;
245
246         // derived values (DPSOFTRAST_VALIDATE_FB)
247         int fb_colormask;
        // x, y, width, height in framebuffer coordinates, clamped to the framebuffer
248         int fb_scissor[4];
249         ALIGN(float fb_viewportcenter[4]);
250         ALIGN(float fb_viewportscale[4]);
251
252         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        // depthfunc, or GL_ALWAYS while depthtest is off
253         int fb_depthfunc;
254
255         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        // a DPSOFTRAST_BLENDMODE value
256         int fb_blendmode;
257
258         // band boundaries
259         int miny1;
260         int maxy1;
261         int miny2;
262         int maxy2;
263
264         ATOMIC(volatile int commandoffset);
265
266         volatile bool waiting;
267         volatile bool starving;
268         void *waitcond;
269         void *drawcond;
270         void *drawmutex;
271
        // NOTE(review): numspans/numtriangles presumably count the used entries of
        // spans[]/triangles[]; the code filling them is not in this part of the file
272         int numspans;
273         int numtriangles;
274         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
275         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
276 }
277 DPSOFTRAST_State_Thread);
278
// Global state of the software rasterizer; the single instance is the variable
// dpsoftrast defined right after the type.
279 typedef ATOMIC(struct DPSOFTRAST_State_s
280 {
        // render targets, set by DPSOFTRAST_SetRenderTargets
281         int fb_width;
282         int fb_height;
283         unsigned int *fb_depthpixels;
284         unsigned int *fb_colorpixels[4];
285
286         int viewport[4];
287         ALIGN(float fb_viewportcenter[4]);
288         ALIGN(float fb_viewportscale[4]);
289
290         float color[4];
291         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
292         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
293
294         const float *pointer_vertex3f;
295         const float *pointer_color4f;
296         const unsigned char *pointer_color4ub;
297         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
298         int stride_vertex;
299         int stride_color;
300         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
301         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        // pointers into texture[]; DPSOFTRAST_Texture_Grow rebases them when that array moves
302         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
303
304         int firstvertex;
305         int numvertices;
306         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
307         float *screencoord4f;
308         int drawstarty;
309         int drawendy;
310         int drawclipped;
311         
312         int shader_mode;
313         int shader_permutation;
314
        // texture slot array:
        //   texture_max       - number of slots allocated (grown by DPSOFTRAST_Texture_Grow)
        //   texture_end       - one past the highest slot in use
        //   texture_firstfree - index at which DPSOFTRAST_Texture_New starts looking for an empty slot
315         int texture_max;
316         int texture_end;
317         int texture_firstfree;
318         DPSOFTRAST_Texture *texture;
319
320         int bigendian;
321
322         // error reporting
        // set to a static message by functions that reject their arguments
323         const char *errorstring;
324
325         bool usethreads;
326         int interlace;
        // number of entries in threads[]
327         int numthreads;
328         DPSOFTRAST_State_Thread *threads;
329
330         ATOMIC(volatile int drawcommand);
331
332         DPSOFTRAST_State_Command_Pool commandpool;
333 }
334 DPSOFTRAST_State);
335
336 DPSOFTRAST_State dpsoftrast;
337
// depth buffer values are DPSOFTRAST_DEPTHSCALE * (1 - depth); the scale equals 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float color components (expected range 0..1) into one integer with blue in
// bits 0-7, green in bits 8-15, red in bits 16-23 and alpha in bits 24-31, rounding each
// component to the nearest of 256 levels. The arguments are parenthesized so compound
// expressions (for example "c + d") are scaled as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer representation (d = 1 maps to 0).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
343
344 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
345 {
346         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
347         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
348         fb_viewportcenter[3] = 0.5f;
349         fb_viewportcenter[0] = 0.0f;
350         fb_viewportscale[1] = 0.5f * viewport[2];
351         fb_viewportscale[2] = -0.5f * viewport[3];
352         fb_viewportscale[3] = 0.5f;
353         fb_viewportscale[0] = 1.0f;
354 }
355
356 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
357 {
358         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
359         // and viewport projection values
360         int x1, x2;
361         int y1, y2;
362         x1 = thread->scissor[0];
363         x2 = thread->scissor[0] + thread->scissor[2];
364         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
365         y2 = dpsoftrast.fb_height - thread->scissor[1];
366         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
367         if (x1 < 0) x1 = 0;
368         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
369         if (y1 < 0) y1 = 0;
370         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
371         thread->fb_scissor[0] = x1;
372         thread->fb_scissor[1] = y1;
373         thread->fb_scissor[2] = x2 - x1;
374         thread->fb_scissor[3] = y2 - y1;
375
376         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
377 }
378
379 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
380 {
381         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
382 }
383
384 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
385 {
386         if (thread->blendsubtract)
387         {
388                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
389                 {
390                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
391                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
393                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
394                 }
395         }
396         else
397         {       
398                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
399                 {
400                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
401                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
402                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
403                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
404                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
405                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
406                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
407                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
408                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
409                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
410                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
411                 }
412         }
413 }
414
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in f is still
// set in thread->validate, avoiding the function call otherwise. Always evaluates to 0.
// Both arguments are parenthesized so arbitrary expressions can be passed; they are
// evaluated more than once and therefore must not have side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
416
417 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
418 {
419         mask &= thread->validate;
420         if (!mask)
421                 return;
422         if (mask & DPSOFTRAST_VALIDATE_FB)
423         {
424                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
425                 DPSOFTRAST_RecalcFB(thread);
426         }
427         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
428         {
429                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
430                 DPSOFTRAST_RecalcDepthFunc(thread);
431         }
432         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
433         {
434                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
435                 DPSOFTRAST_RecalcBlendFunc(thread);
436         }
437 }
438
439 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
440 {
441         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
442                 return &dpsoftrast.texture[index];
443         return NULL;
444 }
445
446 static void DPSOFTRAST_Texture_Grow(void)
447 {
448         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
449         DPSOFTRAST_State_Thread *thread;
450         int i;
451         int j;
452         DPSOFTRAST_Flush();
453         // expand texture array as needed
454         if (dpsoftrast.texture_max < 1024)
455                 dpsoftrast.texture_max = 1024;
456         else
457                 dpsoftrast.texture_max *= 2;
458         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
459         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
460                 if (dpsoftrast.texbound[i])
461                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
462         for (j = 0; j < dpsoftrast.numthreads; j++)
463         {
464                 thread = &dpsoftrast.threads[j];
465                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
466                         if (thread->texbound[i])
467                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
468         }
469 }
470
471 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
472 {
473         int w;
474         int h;
475         int d;
476         int size;
477         int s;
478         int texnum;
479         int mipmaps;
480         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
481         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
482         DPSOFTRAST_Texture *texture;
483         if (width*height*depth < 1)
484         {
485                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
486                 return 0;
487         }
488         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
489         {
490                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
491                 return 0;
492         }
493         switch(texformat)
494         {
495         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
496         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
497         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
498                 break;
499         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
500                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
501                 {
502                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
503                         return 0;
504                 }
505                 if (depth != 1)
506                 {
507                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
508                         return 0;
509                 }
510                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
511                 {
512                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
513                         return 0;
514                 }
515                 break;
516         }
517         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
518         {
519                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
520                 return 0;
521         }
522         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
523         {
524                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
525                 return 0;
526         }
527         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
528         {
529                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
530                 return 0;
531         }
532         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
533         {
534                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
535                 return 0;
536         }
537         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
538         {
539                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
540                 return 0;
541         }
542         // find first empty slot in texture array
543         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
544                 if (!dpsoftrast.texture[texnum].bytes)
545                         break;
546         dpsoftrast.texture_firstfree = texnum + 1;
547         if (dpsoftrast.texture_max <= texnum)
548                 DPSOFTRAST_Texture_Grow();
549         if (dpsoftrast.texture_end <= texnum)
550                 dpsoftrast.texture_end = texnum + 1;
551         texture = &dpsoftrast.texture[texnum];
552         memset(texture, 0, sizeof(*texture));
553         texture->flags = flags;
554         texture->width = width;
555         texture->height = height;
556         texture->depth = depth;
557         texture->sides = sides;
558         texture->binds = 0;
559         w = width;
560         h = height;
561         d = depth;
562         size = 0;
563         mipmaps = 0;
564         w = width;
565         h = height;
566         d = depth;
567         for (;;)
568         {
569                 s = w * h * d * sides * 4;
570                 texture->mipmap[mipmaps][0] = size;
571                 texture->mipmap[mipmaps][1] = s;
572                 texture->mipmap[mipmaps][2] = w;
573                 texture->mipmap[mipmaps][3] = h;
574                 texture->mipmap[mipmaps][4] = d;
575                 size += s;
576                 mipmaps++;
577                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
578                         break;
579                 if (w > 1) w >>= 1;
580                 if (h > 1) h >>= 1;
581                 if (d > 1) d >>= 1;
582         }
583         texture->mipmaps = mipmaps;
584         texture->size = size;
585
586         // allocate the pixels now
587         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
588
589         return texnum;
590 }
591 void DPSOFTRAST_Texture_Free(int index)
592 {
593         DPSOFTRAST_Texture *texture;
594         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
595         if (texture->binds)
596                 DPSOFTRAST_Flush();
597         if (texture->bytes)
598                 MM_FREE(texture->bytes);
599         texture->bytes = NULL;
600         memset(texture, 0, sizeof(*texture));
601         // adjust the free range and used range
602         if (dpsoftrast.texture_firstfree > index)
603                 dpsoftrast.texture_firstfree = index;
604         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
605                 dpsoftrast.texture_end--;
606 }
607 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
608 {
609         int i, x, y, z, w, layer0, layer1, row0, row1;
610         unsigned char *o, *i0, *i1, *i2, *i3;
611         DPSOFTRAST_Texture *texture;
612         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
613         if (texture->mipmaps <= 1)
614                 return;
615         for (i = 1;i < texture->mipmaps;i++)
616         {
617                 for (z = 0;z < texture->mipmap[i][4];z++)
618                 {
619                         layer0 = z*2;
620                         layer1 = z*2+1;
621                         if (layer1 >= texture->mipmap[i-1][4])
622                                 layer1 = texture->mipmap[i-1][4]-1;
623                         for (y = 0;y < texture->mipmap[i][3];y++)
624                         {
625                                 row0 = y*2;
626                                 row1 = y*2+1;
627                                 if (row1 >= texture->mipmap[i-1][3])
628                                         row1 = texture->mipmap[i-1][3]-1;
629                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
630                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
631                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
632                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
633                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
634                                 w = texture->mipmap[i][2];
635                                 if (layer1 > layer0)
636                                 {
637                                         if (texture->mipmap[i-1][2] > 1)
638                                         {
639                                                 // average 3D texture
640                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
641                                                 {
642                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
643                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
644                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
645                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
646                                                 }
647                                         }
648                                         else
649                                         {
650                                                 // average 3D mipmap with parent width == 1
651                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
652                                                 {
653                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
654                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
655                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
656                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
657                                                 }
658                                         }
659                                 }
660                                 else
661                                 {
662                                         if (texture->mipmap[i-1][2] > 1)
663                                         {
664                                                 // average 2D texture (common case)
665                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
666                                                 {
667                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
668                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
669                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
670                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
671                                                 }
672                                         }
673                                         else
674                                         {
675                                                 // 2D texture with parent width == 1
676                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
677                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
678                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
679                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
680                                         }
681                                 }
682                         }
683                 }
684         }
685 }
686 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
687 {
688         DPSOFTRAST_Texture *texture;
689         unsigned char *dst;
690         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
691         if (texture->binds)
692                 DPSOFTRAST_Flush();
693         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
694         while (blockheight > 0)
695         {
696                 memcpy(dst, pixels, blockwidth * 4);
697                 pixels += blockwidth * 4;
698                 dst += texture->mipmap[0][2] * 4;
699                 blockheight--;
700         }
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
707         if (texture->binds)
708                 DPSOFTRAST_Flush();
709         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
710         DPSOFTRAST_Texture_CalculateMipmaps(index);
711 }
712 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
713 {
714         DPSOFTRAST_Texture *texture;
715         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
716         return texture->mipmap[mip][2];
717 }
718 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
719 {
720         DPSOFTRAST_Texture *texture;
721         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722         return texture->mipmap[mip][3];
723 }
724 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
725 {
726         DPSOFTRAST_Texture *texture;
727         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728         return texture->mipmap[mip][4];
729 }
730 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
731 {
732         DPSOFTRAST_Texture *texture;
733         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
734         if (texture->binds)
735                 DPSOFTRAST_Flush();
736         return texture->bytes + texture->mipmap[mip][0];
737 }
738 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
739 {
740         DPSOFTRAST_Texture *texture;
741         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
742         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
743         {
744                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
745                 return;
746         }
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         texture->filter = filter;
750 }
751
752 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
753 {
754         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
755                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
756                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
757                 DPSOFTRAST_Flush();
758         dpsoftrast.fb_width = width;
759         dpsoftrast.fb_height = height;
760         dpsoftrast.fb_depthpixels = depthpixels;
761         dpsoftrast.fb_colorpixels[0] = colorpixels0;
762         dpsoftrast.fb_colorpixels[1] = colorpixels1;
763         dpsoftrast.fb_colorpixels[2] = colorpixels2;
764         dpsoftrast.fb_colorpixels[3] = colorpixels3;
765 }
766
// forward declaration: called by DPSOFTRAST_AllocateCommand below, defined later in this file
static void DPSOFTRAST_Draw_FlushThreads(void);
768
769 static void DPSOFTRAST_Draw_SyncCommands(void)
770 {
771         if(dpsoftrast.usethreads) MEMORY_BARRIER;
772         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
773 }
774
775 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
776 {
777         DPSOFTRAST_State_Thread *thread;
778         int i;
779         int freecommand = dpsoftrast.commandpool.freecommand;
780         int usedcommands = dpsoftrast.commandpool.usedcommands;
781         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
782                 return;
783         DPSOFTRAST_Draw_SyncCommands();
784         for(;;)
785         {
786                 int waitindex = -1;
787                 int commandoffset;
788                 usedcommands = 0;
789                 for (i = 0; i < dpsoftrast.numthreads; i++)
790                 {
791                         thread = &dpsoftrast.threads[i]; 
792                         commandoffset = freecommand - thread->commandoffset;
793                         if (commandoffset < 0)
794                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
795                         if (commandoffset > usedcommands)
796                         {
797                                 waitindex = i;
798                                 usedcommands = commandoffset;
799                         }
800                 }
801                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
802                         break;
803                 thread = &dpsoftrast.threads[waitindex];
804                 Thread_LockMutex(thread->drawmutex);
805                 if (thread->commandoffset != dpsoftrast.drawcommand)
806                 {
807                         thread->waiting = true;
808                         if (thread->starving) Thread_CondSignal(thread->drawcond);
809                         Thread_CondWait(thread->waitcond, thread->drawmutex);
810                         thread->waiting = false;
811                 }
812                 Thread_UnlockMutex(thread->drawmutex);
813         }
814         dpsoftrast.commandpool.usedcommands = usedcommands;
815 }
816
// Pads size up to a multiple of COMMAND_SIZE. The masking with COMMAND_SIZE-1
// presumably relies on COMMAND_SIZE being a power of two - TODO confirm at its
// definition. Note: the argument is evaluated twice.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
        ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> via DPSOFTRAST_AllocateCommand, using the padded
// struct size, and yields the result cast to the command's struct type.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
        ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
821
822 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
823 {
824         DPSOFTRAST_Command *command;
825         int freecommand = dpsoftrast.commandpool.freecommand;
826         int usedcommands = dpsoftrast.commandpool.usedcommands;
827         int extra = sizeof(DPSOFTRAST_Command);
828         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
829                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
830         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
831         {
832                 if (dpsoftrast.usethreads)
833                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
834                 else
835                         DPSOFTRAST_Draw_FlushThreads();
836                 freecommand = dpsoftrast.commandpool.freecommand;
837                 usedcommands = dpsoftrast.commandpool.usedcommands;
838         }
839         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
840         {
841                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
842                 command->opcode = DPSOFTRAST_OPCODE_Reset;
843                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
844                 freecommand = 0;
845         }
846         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
847         command->opcode = opcode;
848         command->commandsize = size;
849         freecommand += size;
850         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
851                 freecommand = 0;
852         dpsoftrast.commandpool.freecommand = freecommand;
853         dpsoftrast.commandpool.usedcommands = usedcommands + size;
854         return command;
855 }
856
857 static void DPSOFTRAST_UndoCommand(int size)
858 {
859         int freecommand = dpsoftrast.commandpool.freecommand;
860         int usedcommands = dpsoftrast.commandpool.usedcommands;
861         freecommand -= size;
862         if (freecommand < 0)
863                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
864         usedcommands -= size;
865         dpsoftrast.commandpool.freecommand = freecommand;
866         dpsoftrast.commandpool.usedcommands = usedcommands;
867 }
868                 
869 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
870 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
871 {
872         thread->viewport[0] = command->x;
873         thread->viewport[1] = command->y;
874         thread->viewport[2] = command->width;
875         thread->viewport[3] = command->height;
876         thread->validate |= DPSOFTRAST_VALIDATE_FB;
877 }
878 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
879 {
880         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
881         command->x = x;
882         command->y = y;
883         command->width = width;
884         command->height = height;
885
886         dpsoftrast.viewport[0] = x;
887         dpsoftrast.viewport[1] = y;
888         dpsoftrast.viewport[2] = width;
889         dpsoftrast.viewport[3] = height;
890         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
891 }
892
893 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
894 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
895 {
896         int i, x1, y1, x2, y2, w, h, x, y;
897         int miny1 = thread->miny1;
898         int maxy1 = thread->maxy1;
899         int miny2 = thread->miny2;
900         int maxy2 = thread->maxy2;
901         int bandy;
902         unsigned int *p;
903         unsigned int c;
904         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
905         x1 = thread->fb_scissor[0];
906         y1 = thread->fb_scissor[1];
907         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
908         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
909         if (y1 < miny1) y1 = miny1;
910         if (y2 > maxy2) y2 = maxy2;
911         w = x2 - x1;
912         h = y2 - y1;
913         if (w < 1 || h < 1)
914                 return;
915         // FIXME: honor fb_colormask?
916         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
917         for (i = 0;i < 4;i++)
918         {
919                 if (!dpsoftrast.fb_colorpixels[i])
920                         continue;
921                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
922                 for (;y < bandy;y++)
923                 {
924                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
925                         for (x = x1;x < x2;x++)
926                                 p[x] = c;
927                 }
928         }
929 }
930 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
931 {
932         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
933         command->r = r;
934         command->g = g;
935         command->b = b;
936         command->a = a;
937 }
938
939 DEFCOMMAND(3, ClearDepth, float depth;)
940 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
941 {
942         int x1, y1, x2, y2, w, h, x, y;
943         int miny1 = thread->miny1;
944         int maxy1 = thread->maxy1;
945         int miny2 = thread->miny2;
946         int maxy2 = thread->maxy2;
947         int bandy;
948         unsigned int *p;
949         unsigned int c;
950         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
951         x1 = thread->fb_scissor[0];
952         y1 = thread->fb_scissor[1];
953         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
955         if (y1 < miny1) y1 = miny1;
956         if (y2 > maxy2) y2 = maxy2;
957         w = x2 - x1;
958         h = y2 - y1;
959         if (w < 1 || h < 1)
960                 return;
961         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
962         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
963         for (;y < bandy;y++)
964         {
965                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
966                 for (x = x1;x < x2;x++)
967                         p[x] = c;
968         }
969 }
970 void DPSOFTRAST_ClearDepth(float d)
971 {
972         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
973         command->depth = d;
974 }
975
976 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
977 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
978 {
979         thread->colormask[0] = command->r != 0;
980         thread->colormask[1] = command->g != 0;
981         thread->colormask[2] = command->b != 0;
982         thread->colormask[3] = command->a != 0;
983         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
984 }
985 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
986 {
987         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
988         command->r = r;
989         command->g = g;
990         command->b = b;
991         command->a = a;
992 }
993
994 DEFCOMMAND(5, DepthTest, int enable;)
995 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
996 {
997         thread->depthtest = command->enable;
998         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
999 }
1000 void DPSOFTRAST_DepthTest(int enable)
1001 {
1002         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1003         command->enable = enable;
1004 }
1005
1006 DEFCOMMAND(6, ScissorTest, int enable;)
1007 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1008 {
1009         thread->scissortest = command->enable;
1010         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1011 }
1012 void DPSOFTRAST_ScissorTest(int enable)
1013 {
1014         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1015         command->enable = enable;
1016 }
1017
1018 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1019 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1020 {
1021         thread->scissor[0] = command->x;
1022         thread->scissor[1] = command->y;
1023         thread->scissor[2] = command->width;
1024         thread->scissor[3] = command->height;
1025         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1026 }
1027 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1028 {
1029         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1030         command->x = x;
1031         command->y = y;
1032         command->width = width;
1033         command->height = height;
1034 }
1035
1036 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1037 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1038 {
1039         thread->blendfunc[0] = command->sfactor;
1040         thread->blendfunc[1] = command->dfactor;
1041         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1042 }
1043 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1044 {
1045         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1046         command->sfactor = sfactor;
1047         command->dfactor = dfactor;
1048 }
1049
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1052 {
1053         thread->blendsubtract = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1055 }
1056 void DPSOFTRAST_BlendSubtract(int enable)
1057 {
1058         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059         command->enable = enable;
1060 }
1061
1062 DEFCOMMAND(10, DepthMask, int enable;)
1063 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1064 {
1065         thread->depthmask = command->enable;
1066 }
1067 void DPSOFTRAST_DepthMask(int enable)
1068 {
1069         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1070         command->enable = enable;
1071 }
1072
1073 DEFCOMMAND(11, DepthFunc, int func;)
1074 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1075 {
1076         thread->depthfunc = command->func;
1077 }
1078 void DPSOFTRAST_DepthFunc(int func)
1079 {
1080         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1081         command->func = func;
1082 }
1083
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1086 {
1087         thread->depthrange[0] = command->nearval;
1088         thread->depthrange[1] = command->farval;
1089 }
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1091 {
1092         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093         command->nearval = nearval;
1094         command->farval = farval;
1095 }
1096
1097 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1098 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1099 {
1100         thread->polygonoffset[0] = command->alongnormal;
1101         thread->polygonoffset[1] = command->intoview;
1102 }
1103 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1104 {
1105         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1106         command->alongnormal = alongnormal;
1107         command->intoview = intoview;
1108 }
1109
1110 DEFCOMMAND(14, CullFace, int mode;)
1111 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1112 {
1113         thread->cullface = command->mode;
1114 }
1115 void DPSOFTRAST_CullFace(int mode)
1116 {
1117         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1118         command->mode = mode;
1119 }
1120
1121 DEFCOMMAND(15, AlphaTest, int enable;)
1122 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1123 {
1124         thread->alphatest = command->enable;
1125 }
1126 void DPSOFTRAST_AlphaTest(int enable)
1127 {
1128         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1129         command->enable = enable;
1130 }
1131
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1134 {
1135         thread->alphafunc = command->func;
1136         thread->alphavalue = command->ref;
1137 }
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1139 {
1140         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141         command->func = func;
1142         command->ref = ref;
1143 }
1144
1145 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1146 {
1147         dpsoftrast.color[0] = r;
1148         dpsoftrast.color[1] = g;
1149         dpsoftrast.color[2] = b;
1150         dpsoftrast.color[3] = a;
1151 }
1152
1153 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1154 {
1155         int outstride = blockwidth * 4;
1156         int instride = dpsoftrast.fb_width * 4;
1157         int bx1 = blockx;
1158         int by1 = blocky;
1159         int bx2 = blockx + blockwidth;
1160         int by2 = blocky + blockheight;
1161         int bw;
1162         int x;
1163         int y;
1164         unsigned char *inpixels;
1165         unsigned char *b;
1166         unsigned char *o;
1167         DPSOFTRAST_Flush();
1168         if (bx1 < 0) bx1 = 0;
1169         if (by1 < 0) by1 = 0;
1170         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1171         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1172         bw = bx2 - bx1;
1173         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1174         if (dpsoftrast.bigendian)
1175         {
1176                 for (y = by1;y < by2;y++)
1177                 {
1178                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1180                         for (x = bx1;x < bx2;x++)
1181                         {
1182                                 o[0] = b[3];
1183                                 o[1] = b[2];
1184                                 o[2] = b[1];
1185                                 o[3] = b[0];
1186                                 o += 4;
1187                                 b += 4;
1188                         }
1189                 }
1190         }
1191         else
1192         {
1193                 for (y = by1;y < by2;y++)
1194                 {
1195                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1197                         memcpy(o, b, bw*4);
1198                 }
1199         }
1200
1201 }
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1203 {
1204         int tx1 = tx;
1205         int ty1 = ty;
1206         int tx2 = tx + width;
1207         int ty2 = ty + height;
1208         int sx1 = sx;
1209         int sy1 = sy;
1210         int sx2 = sx + width;
1211         int sy2 = sy + height;
1212         int swidth;
1213         int sheight;
1214         int twidth;
1215         int theight;
1216         int sw;
1217         int sh;
1218         int tw;
1219         int th;
1220         int y;
1221         unsigned int *spixels;
1222         unsigned int *tpixels;
1223         DPSOFTRAST_Texture *texture;
1224         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225         if (mip < 0 || mip >= texture->mipmaps) return;
1226         DPSOFTRAST_Flush();
1227         spixels = dpsoftrast.fb_colorpixels[0];
1228         swidth = dpsoftrast.fb_width;
1229         sheight = dpsoftrast.fb_height;
1230         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231         twidth = texture->mipmap[mip][2];
1232         theight = texture->mipmap[mip][3];
1233         if (tx1 < 0) tx1 = 0;
1234         if (ty1 < 0) ty1 = 0;
1235         if (tx2 > twidth) tx2 = twidth;
1236         if (ty2 > theight) ty2 = theight;
1237         if (sx1 < 0) sx1 = 0;
1238         if (sy1 < 0) sy1 = 0;
1239         if (sx2 > swidth) sx2 = swidth;
1240         if (sy2 > sheight) sy2 = sheight;
1241         tw = tx2 - tx1;
1242         th = ty2 - ty1;
1243         sw = sx2 - sx1;
1244         sh = sy2 - sy1;
1245         if (tw > sw) tw = sw;
1246         if (th > sh) th = sh;
1247         if (tw < 1 || th < 1)
1248                 return;
1249         for (y = 0;y < th;y++)
1250                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1251         if (texture->mipmaps > 1)
1252                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1253 }
1254
1255 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1256 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1257 {
1258         if (thread->texbound[command->unitnum])
1259                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1260         thread->texbound[command->unitnum] = command->texture;
1261 }
1262 void DPSOFTRAST_SetTexture(int unitnum, int index)
1263 {
1264         DPSOFTRAST_Command_SetTexture *command;
1265         DPSOFTRAST_Texture *texture;
1266         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1267         {
1268                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1269                 return;
1270         }
1271         texture = DPSOFTRAST_Texture_GetByIndex(index);
1272         if (index && !texture)
1273         {
1274                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1275                 return;
1276         }
1277
1278         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1279         command->unitnum = unitnum;
1280         command->texture = texture;
1281
1282         dpsoftrast.texbound[unitnum] = texture;
1283         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1284 }
1285
1286 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1287 {
1288         dpsoftrast.pointer_vertex3f = vertex3f;
1289         dpsoftrast.stride_vertex = stride;
1290 }
1291 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1292 {
1293         dpsoftrast.pointer_color4f = color4f;
1294         dpsoftrast.pointer_color4ub = NULL;
1295         dpsoftrast.stride_color = stride;
1296 }
1297 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1298 {
1299         dpsoftrast.pointer_color4f = NULL;
1300         dpsoftrast.pointer_color4ub = color4ub;
1301         dpsoftrast.stride_color = stride;
1302 }
1303 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1304 {
1305         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1306         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1307         dpsoftrast.stride_texcoord[unitnum] = stride;
1308 }
1309
1310 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1311 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1312 {
1313         thread->shader_mode = command->mode;
1314         thread->shader_permutation = command->permutation;
1315 }
1316 void DPSOFTRAST_SetShader(int mode, int permutation)
1317 {
1318         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1319         command->mode = mode;
1320         command->permutation = permutation;
1321
1322         dpsoftrast.shader_mode = mode;
1323         dpsoftrast.shader_permutation = permutation;
1324 }
1325
1326 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1327 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1328 {
1329         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1330 }
1331 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1332 {
1333         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1334         command->index = index;
1335         command->val[0] = v0;
1336         command->val[1] = v1;
1337         command->val[2] = v2;
1338         command->val[3] = v3;
1339
1340         dpsoftrast.uniform4f[index*4+0] = v0;
1341         dpsoftrast.uniform4f[index*4+1] = v1;
1342         dpsoftrast.uniform4f[index*4+2] = v2;
1343         dpsoftrast.uniform4f[index*4+3] = v3;
1344 }
1345 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1346 {
1347         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1348         command->index = index;
1349         memcpy(command->val, v, sizeof(command->val));
1350
1351         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1352 }
1353
1354 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1355 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1356 {
1357         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1358 }
1359 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1360 {
1361 #ifdef SSE2_PRESENT
1362         int i, index;
1363         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1364         {
1365                 __m128 m0, m1, m2, m3;
1366                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1367                 command->index = (DPSOFTRAST_UNIFORM)index;
1368                 if (((size_t)v)&(ALIGN_SIZE-1))
1369                 {
1370                         m0 = _mm_loadu_ps(v);
1371                         m1 = _mm_loadu_ps(v+4);
1372                         m2 = _mm_loadu_ps(v+8);
1373                         m3 = _mm_loadu_ps(v+12);
1374                 }
1375                 else
1376                 {
1377                         m0 = _mm_load_ps(v);
1378                         m1 = _mm_load_ps(v+4);
1379                         m2 = _mm_load_ps(v+8);
1380                         m3 = _mm_load_ps(v+12);
1381                 }
1382                 if (transpose)
1383                 {
1384                         __m128 t0, t1, t2, t3;
1385                         t0 = _mm_unpacklo_ps(m0, m1);
1386                         t1 = _mm_unpacklo_ps(m2, m3);
1387                         t2 = _mm_unpackhi_ps(m0, m1);
1388                         t3 = _mm_unpackhi_ps(m2, m3);
1389                         m0 = _mm_movelh_ps(t0, t1);
1390                         m1 = _mm_movehl_ps(t1, t0);
1391                         m2 = _mm_movelh_ps(t2, t3);
1392                         m3 = _mm_movehl_ps(t3, t2);                     
1393                 }
1394                 _mm_store_ps(command->val, m0);
1395                 _mm_store_ps(command->val+4, m1);
1396                 _mm_store_ps(command->val+8, m2);
1397                 _mm_store_ps(command->val+12, m3);
1398                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1399                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1400                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1401                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1402         }
1403 #endif
1404 }
1405
1406 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1407 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1408 {
1409         thread->uniform1i[command->index] = command->val;
1410 }
1411 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1412 {
1413         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1414         command->index = index;
1415         command->val = i0;
1416
1417         dpsoftrast.uniform1i[command->index] = i0;
1418 }
1419
1420 #ifdef SSE2_PRESENT
1421 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1422 {
1423         float *end = dst + size*4;
1424         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1425         {
1426                 while (dst < end)
1427                 {
1428                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1429                         dst += 4;
1430                         src += stride;
1431                 }
1432         }
1433         else
1434         {
1435                 while (dst < end)
1436                 {
1437                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1438                         dst += 4;
1439                         src += stride;
1440                 }
1441         }
1442 }
1443
1444 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1445 {
1446         float *end = dst + size*4;
1447         if (stride == sizeof(float[3]))
1448         {
1449                 float *end4 = dst + (size&~3)*4;        
1450                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1451                 {
1452                         while (dst < end4)
1453                         {
1454                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1455                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1456                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1457                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1458                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1459                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1460                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1461                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1462                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1463                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1464                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1465                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1466                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1467                                 dst += 16;
1468                                 src += 4*sizeof(float[3]);
1469                         }
1470                 }
1471                 else
1472                 {
1473                         while (dst < end4)
1474                         {
1475                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1476                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1480                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1481                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1482                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1483                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1484                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1485                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1486                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1487                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1488                                 dst += 16;
1489                                 src += 4*sizeof(float[3]);
1490                         }
1491                 }
1492         }
1493         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1494         {
1495                 while (dst < end)
1496                 {
1497                         __m128 v = _mm_loadu_ps((const float *)src);
1498                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1499                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1500                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1501                         _mm_store_ps(dst, v);
1502                         dst += 4;
1503                         src += stride;
1504                 }
1505         }
1506         else
1507         {
1508                 while (dst < end)
1509                 {
1510                         __m128 v = _mm_load_ps((const float *)src);
1511                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1512                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1513                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1514                         _mm_store_ps(dst, v);
1515                         dst += 4;
1516                         src += stride;
1517                 }
1518         }
1519 }
1520
1521 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1522 {
1523         float *end = dst + size*4;
1524         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1525         if (stride == sizeof(float[2]))
1526         {
1527                 float *end2 = dst + (size&~1)*4;
1528                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1529                 {
1530                         while (dst < end2)
1531                         {
1532                                 __m128 v = _mm_loadu_ps((const float *)src);
1533                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1534                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1535                                 dst += 8;
1536                                 src += 2*sizeof(float[2]);
1537                         }
1538                 }
1539                 else
1540                 {
1541                         while (dst < end2)
1542                         {
1543                                 __m128 v = _mm_load_ps((const float *)src);
1544                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1545                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1546                                 dst += 8;
1547                                 src += 2*sizeof(float[2]);
1548                         }
1549                 }
1550         }
1551         while (dst < end)
1552         {
1553                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1554                 dst += 4;
1555                 src += stride;
1556         }
1557 }
1558
1559 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1560 {
1561         float *end = dst + size*4;
1562         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1563         if (stride == sizeof(unsigned char[4]))
1564         {
1565                 float *end4 = dst + (size&~3)*4;
1566                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1567                 {
1568                         while (dst < end4)
1569                         {
1570                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1571                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1572                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1573                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1574                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1575                                 dst += 16;
1576                                 src += 4*sizeof(unsigned char[4]);
1577                         }
1578                 }
1579                 else
1580                 {
1581                         while (dst < end4)
1582                         {
1583                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1584                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1585                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1586                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1587                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1588                                 dst += 16;
1589                                 src += 4*sizeof(unsigned char[4]);
1590                         }
1591                 }
1592         }
1593         while (dst < end)
1594         {
1595                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1596                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1597                 dst += 4;
1598                 src += stride;
1599         }
1600 }
1601
// Writes `size` copies of one four-float vector into `dst`.
//   dst  - destination, 4 floats per element; written with aligned stores
//   src  - the four floats to replicate (read once, no alignment required)
//   size - number of copies to write
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
        const __m128 value = _mm_loadu_ps(src);
        int i;
        for (i = 0; i < size; i++)
                _mm_store_ps(dst + 4*i, value);
}
1612 #endif
1613
// Multiplies `numitems` four-float vectors by a 4x4 matrix:
//   out = in.x * M[0..3] + in.y * M[4..7] + in.z * M[8..11] + in.w * M[12..15]
//   out4f       - destination vectors; written with aligned stores
//   in4f        - source vectors; may be the same array as out4f
//   numitems    - number of vectors
//   inmatrix16f - the 16 matrix floats (no alignment required)
// A matrix that is bit-for-bit the identity turns the call into a plain copy
// (or into nothing at all when out4f == in4f).
// The whole body is compiled only when SSE2_PRESENT is defined; without it
// the function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        __m128 mx, my, mz, mw;
        int i;
        if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
        {
                // fast case for identity matrix
                if (out4f != in4f)
                        memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
        mx = _mm_loadu_ps(inmatrix16f);
        my = _mm_loadu_ps(inmatrix16f + 4);
        mz = _mm_loadu_ps(inmatrix16f + 8);
        mw = _mm_loadu_ps(inmatrix16f + 12);
        if (((size_t)in4f)&(ALIGN_SIZE-1))
        {
                // input is not 16-byte aligned
                for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
                {
                        const __m128 v = _mm_loadu_ps(in4f);
                        const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
                        const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
                        const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
                        const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
                        // summed from the w term inwards: tx + (ty + (tz + tw))
                        _mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
                }
        }
        else
        {
                for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
                {
                        const __m128 v = _mm_load_ps(in4f);
                        const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
                        const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
                        const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
                        const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
                        _mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
                }
        }
#endif
}
1661
// Copies `numitems` four-float vectors from in4f to out4f.
//   out4f    - destination array
//   in4f     - source array
//   numitems - number of vectors; zero or negative counts copy nothing
// The two arrays may be identical or overlap: memmove is used because memcpy
// is undefined for overlapping regions, and the count is checked so that a
// negative value is never converted into an enormous byte length.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
        if (numitems <= 0 || out4f == in4f)
                return;
        memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1666
1667 #ifdef SSE2_PRESENT
// Perspective-divides the vector `in` (x, y, z, w) and applies the viewport
// mapping, assigning the result to `out`:
//   out = (center[1] + scale[1]*x/w,
//          center[2] + scale[2]*y/w,
//          center[3] + scale[3]*z/w,
//          center[0] + scale[0]*1/w)
// so lane 0 of viewportcenter/viewportscale feeds the 1/w output lane and
// lanes 1..3 feed x, y and z.
// `in` is copied into a local before `out` is assigned, so the same variable
// may be passed for both.  The expansion declares locals named `p` and `w`;
// arguments must not refer to caller variables with those names.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1675
// The body of this macro is the same token sequence as
// DPSOFTRAST_PROJECTVERTEX: it rotates `in` to (w, x, y, z), replaces lane 0
// with 1.0f, multiplies by viewportscale, divides by w, adds viewportcenter
// and rotates back, assigning (x', y', z', center[0] + scale[0]/w) to `out`.
// The expansion declares locals named `p` and `w`; arguments must not refer
// to caller variables with those names.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1683
// Transforms the vector `in` (x, y, z, w) by four matrix vectors and assigns
// the result to `out`:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// `in` is copied into a local before `out` is assigned, so the same variable
// may be passed for both.  The expansion declares a local named `p`;
// arguments must not refer to a caller variable with that name.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
        __m128 p = (in); \
        out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
                                                  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
                                                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
                                                                                        _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1692
// Computes the vertical screen extent of an axis-aligned bounding box after
// it has been transformed by the matrix vectors m0..m3.
//   starty, endy   - receive the resulting integer row range
//   minpos, maxpos - opposite corners of the box (before the transform)
//   viewportcenter, viewportscale - viewport mapping in the lane order used by
//                    DPSOFTRAST_PROJECTVERTEX (lane 2 carries the y term)
//   m0..m3         - matrix vectors as accepted by DPSOFTRAST_TRANSFORMVERTEX
// Returns a bitmask with bit k set for every box corner k whose transformed
// z + w is negative; bits 0, 1 and 2 of k select the max x, y and z
// coordinate respectively.
// Note that the helper macros BBFRONT and BBCLIP are not #undef'd at the end
// of the function and therefore stay defined for the rest of the file.
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
        int clipmask = 0xFF;
        // min/max start inverted (2 / -2) so the first projected value replaces both
        __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
        // swap lanes 0 and 1 of the matrix so that the transformed y lands in
        // lane 0, where the scalar (_ss) operations below work on it
        m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
        m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
        m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
        m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k, store z + w as its clip distance,
        // and if that distance is >= 0 clear its clipmask bit and fold y/w into
        // the running minimum/maximum
        #define BBFRONT(k, pos) \
        { \
                DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
                clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
                { \
                        __m128 proj; \
                        clipmask &= ~(1<<k); \
                        proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
                        minproj = _mm_min_ss(minproj, proj); \
                        maxproj = _mm_max_ss(maxproj, proj); \
                } \
        }
        // the eight corners: lanes taken from maxpos follow the bits of k
        // (bit 0 -> x, bit 1 -> y, bit 2 -> z; w travels together with z)
        BBFRONT(0, minpos); 
        BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
        BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
        BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
        BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner that is still flagged in clipmask, visit the
        // three corners that differ from it in exactly one axis bit (k^1, k^2,
        // k^4); for each one that is not flagged, interpolate along the edge to
        // the point where the clip distance reaches zero and fold that point's
        // y/w into the running minimum/maximum
        #define BBCLIP(k) \
        { \
                if (clipmask&(1<<k)) \
                { \
                        if (!(clipmask&(1<<(k^1)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^2)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                        if (!(clipmask&(1<<(k^4)))) \
                        { \
                                __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
                                __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
                                proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
                                minproj = _mm_min_ss(minproj, proj); \
                                maxproj = _mm_max_ss(maxproj, proj); \
                        } \
                } \
        }
        BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // bring lane 2 (the y term) of the viewport mapping into lane 0
        viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
        viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // limit the projected range to [-2, 2] before mapping it to the screen
        minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
        maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
        minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
        maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // the start row comes from the maximum and the end row from the minimum
        // NOTE(review): presumably the y viewport scale is negative (screen y
        // grows downwards) - confirm where fb_viewportscale is set up
        *starty = _mm_cvttss_si32(maxproj);
        *endy = _mm_cvttss_si32(minproj)+1;
        return clipmask;
}
1763         
1764 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1765 {
1766         float *end = out4f + numitems*4;
1767         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1768         __m128 minpos, maxpos;
1769         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1770         {
1771                 minpos = maxpos = _mm_loadu_ps(in4f);
1772                 while (out4f < end)
1773                 {
1774                         __m128 v = _mm_loadu_ps(in4f);
1775                         minpos = _mm_min_ps(minpos, v);
1776                         maxpos = _mm_max_ps(maxpos, v);
1777                         _mm_store_ps(out4f, v);
1778                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1779                         _mm_store_ps(screen4f, v);
1780                         in4f += 4;
1781                         out4f += 4;
1782                         screen4f += 4;
1783                 }
1784         }
1785         else
1786         {
1787                 minpos = maxpos = _mm_load_ps(in4f);
1788                 while (out4f < end)
1789                 {
1790                         __m128 v = _mm_load_ps(in4f);
1791                         minpos = _mm_min_ps(minpos, v);
1792                         maxpos = _mm_max_ps(maxpos, v);
1793                         _mm_store_ps(out4f, v);
1794                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1795                         _mm_store_ps(screen4f, v);
1796                         in4f += 4;
1797                         out4f += 4;
1798                         screen4f += 4;
1799                 }
1800         }
1801         if (starty && endy) 
1802                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1803                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1804                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1805                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1806                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1807         return 0;
1808 }
1809
1810 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1811 {
1812         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1813         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1814         float *end;
1815         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1816                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1817         end = out4f + numitems*4;
1818         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1819         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1820         m0 = _mm_loadu_ps(inmatrix16f);
1821         m1 = _mm_loadu_ps(inmatrix16f + 4);
1822         m2 = _mm_loadu_ps(inmatrix16f + 8);
1823         m3 = _mm_loadu_ps(inmatrix16f + 12);
1824         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1825         {
1826                 minpos = maxpos = _mm_loadu_ps(in4f);
1827                 while (out4f < end)
1828                 {
1829                         __m128 v = _mm_loadu_ps(in4f);
1830                         minpos = _mm_min_ps(minpos, v);
1831                         maxpos = _mm_max_ps(maxpos, v);
1832                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1833                         _mm_store_ps(out4f, v);
1834                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1835                         _mm_store_ps(screen4f, v);
1836                         in4f += 4;
1837                         out4f += 4;
1838                         screen4f += 4;
1839                 }
1840         }
1841         else
1842         {
1843                 minpos = maxpos = _mm_load_ps(in4f);
1844                 while (out4f < end)
1845                 {
1846                         __m128 v = _mm_load_ps(in4f);
1847                         minpos = _mm_min_ps(minpos, v);
1848                         maxpos = _mm_max_ps(maxpos, v);
1849                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1850                         _mm_store_ps(out4f, v);
1851                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1852                         _mm_store_ps(screen4f, v);
1853                         in4f += 4;
1854                         out4f += 4;
1855                         screen4f += 4;
1856                 }
1857         }
1858         if (starty && endy) 
1859                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1860         return 0;
1861 }
1862 #endif
1863
// Expands one of the currently bound vertex arrays into four-float vectors
// in dpsoftrast.post_array4f[outarray] and returns that buffer.
//   outarray - index of the destination buffer
//   inarray  - DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_COLOR, or a
//              texture coordinate array (DPSOFTRAST_ARRAY_TEXCOORD0 + unit)
// dpsoftrast.numvertices elements are converted, starting at element
// dpsoftrast.firstvertex of the source array.  Colours come from the float
// array if one is set, else from the byte array, else the constant
// dpsoftrast.color is replicated.  A texture coordinate unit without a
// pointer, or with a component count other than 2, 3 or 4, leaves the
// buffer untouched.
// Returns NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
        float * const outf = dpsoftrast.post_array4f[outarray];
        const int first = dpsoftrast.firstvertex;
        const int count = dpsoftrast.numvertices;
        if (inarray == DPSOFTRAST_ARRAY_POSITION)
        {
                const int stride = dpsoftrast.stride_vertex;
                const unsigned char *base = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
                DPSOFTRAST_Load3fTo4f(outf, base, count, stride);
        }
        else if (inarray == DPSOFTRAST_ARRAY_COLOR)
        {
                const int stride = dpsoftrast.stride_color;
                if (dpsoftrast.pointer_color4f)
                        DPSOFTRAST_Load4fTo4f(outf, (const unsigned char *)dpsoftrast.pointer_color4f + first * stride, count, stride);
                else if (dpsoftrast.pointer_color4ub)
                        DPSOFTRAST_Load4bTo4f(outf, (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride, count, stride);
                else
                        DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
        }
        else
        {
                // everything else is treated as a texture coordinate unit
                const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
                const int stride = dpsoftrast.stride_texcoord[unit];
                if (dpsoftrast.pointer_texcoordf[unit])
                {
                        const unsigned char *base = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
                        switch (dpsoftrast.components_texcoord[unit])
                        {
                        case 2: DPSOFTRAST_Load2fTo4f(outf, base, count, stride); break;
                        case 3: DPSOFTRAST_Load3fTo4f(outf, base, count, stride); break;
                        case 4: DPSOFTRAST_Load4fTo4f(outf, base, count, stride); break;
                        }
                }
        }
        return outf;
#else
        return NULL;
#endif
}
1922
1923 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1924 {
1925         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1926         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1927         return data;
1928 }
1929
1930 #if 0
// Projects a vertex array to screen space without transforming it.
//   outarray - index of the dpsoftrast.post_array4f buffer to work in
//   inarray  - source array to load into that buffer first (see
//              DPSOFTRAST_Array_Load); a negative value means the buffer
//              already holds the data
// Fills dpsoftrast.screencoord4f, dpsoftrast.drawstarty/drawendy and
// dpsoftrast.drawclipped via DPSOFTRAST_Vertex_Project and returns the
// buffer; returns NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
        return data;
#else
        return NULL;
#endif
}
1941 #endif
1942
// Transforms a vertex array in place by a 4x4 matrix and projects it to
// screen space.
//   outarray    - index of the dpsoftrast.post_array4f buffer to work in
//   inarray     - source array to load into that buffer first (see
//                 DPSOFTRAST_Array_Load); a negative value means the buffer
//                 already holds the data
//   inmatrix16f - the 16 matrix floats
// Fills dpsoftrast.screencoord4f, dpsoftrast.drawstarty/drawendy and
// dpsoftrast.drawclipped via DPSOFTRAST_Vertex_TransformProject and returns
// the buffer; returns NULL when SSE2_PRESENT is not defined.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
        return data;
#else
        return NULL;
#endif
}
1953
1954 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1955 {
1956         int x;
1957         int startx = span->startx;
1958         int endx = span->endx;
1959         float wslope = triangle->w[0];
1960         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1961         float endz = 1.0f / (w + wslope * startx);
1962         for (x = startx;x < endx;)
1963         {
1964                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1965                 float z = endz, dz;
1966                 if (nextsub >= endx) nextsub = endsub = endx-1;
1967                 endz = 1.0f / (w + wslope * nextsub);
1968                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1969                 for (; x <= endsub; x++, z += dz)
1970                         zf[x] = z;
1971         }
1972 }
1973
1974 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1975 {
1976         int x;
1977         int startx = span->startx;
1978         int endx = span->endx;
1979         int d[4];
1980         float a, b;
1981         unsigned char * RESTRICT pixelmask = span->pixelmask;
1982         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1983         if (!pixel)
1984                 return;
1985         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1986         // handle alphatest now (this affects depth writes too)
1987         if (thread->alphatest)
1988                 for (x = startx;x < endx;x++)
1989                         if (in4f[x*4+3] < 0.5f)
1990                                 pixelmask[x] = false;
1991         // FIXME: this does not handle bigendian
1992         switch(thread->fb_blendmode)
1993         {
1994         case DPSOFTRAST_BLENDMODE_OPAQUE:
1995                 for (x = startx;x < endx;x++)
1996                 {
1997                         if (!pixelmask[x])
1998                                 continue;
1999                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2000                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2001                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2002                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2003                         pixel[x*4+0] = d[0];
2004                         pixel[x*4+1] = d[1];
2005                         pixel[x*4+2] = d[2];
2006                         pixel[x*4+3] = d[3];
2007                 }
2008                 break;
2009         case DPSOFTRAST_BLENDMODE_ALPHA:
2010                 for (x = startx;x < endx;x++)
2011                 {
2012                         if (!pixelmask[x])
2013                                 continue;
2014                         a = in4f[x*4+3] * 255.0f;
2015                         b = 1.0f - in4f[x*4+3];
2016                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2017                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2018                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2019                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2020                         pixel[x*4+0] = d[0];
2021                         pixel[x*4+1] = d[1];
2022                         pixel[x*4+2] = d[2];
2023                         pixel[x*4+3] = d[3];
2024                 }
2025                 break;
2026         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2027                 for (x = startx;x < endx;x++)
2028                 {
2029                         if (!pixelmask[x])
2030                                 continue;
2031                         a = in4f[x*4+3] * 255.0f;
2032                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2033                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2034                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2035                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2036                         pixel[x*4+0] = d[0];
2037                         pixel[x*4+1] = d[1];
2038                         pixel[x*4+2] = d[2];
2039                         pixel[x*4+3] = d[3];
2040                 }
2041                 break;
2042         case DPSOFTRAST_BLENDMODE_ADD:
2043                 for (x = startx;x < endx;x++)
2044                 {
2045                         if (!pixelmask[x])
2046                                 continue;
2047                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2048                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2049                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2050                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2051                         pixel[x*4+0] = d[0];
2052                         pixel[x*4+1] = d[1];
2053                         pixel[x*4+2] = d[2];
2054                         pixel[x*4+3] = d[3];
2055                 }
2056                 break;
2057         case DPSOFTRAST_BLENDMODE_INVMOD:
2058                 for (x = startx;x < endx;x++)
2059                 {
2060                         if (!pixelmask[x])
2061                                 continue;
2062                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2063                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2064                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2065                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2066                         pixel[x*4+0] = d[0];
2067                         pixel[x*4+1] = d[1];
2068                         pixel[x*4+2] = d[2];
2069                         pixel[x*4+3] = d[3];
2070                 }
2071                 break;
2072         case DPSOFTRAST_BLENDMODE_MUL:
2073                 for (x = startx;x < endx;x++)
2074                 {
2075                         if (!pixelmask[x])
2076                                 continue;
2077                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2078                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2079                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2080                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2081                         pixel[x*4+0] = d[0];
2082                         pixel[x*4+1] = d[1];
2083                         pixel[x*4+2] = d[2];
2084                         pixel[x*4+3] = d[3];
2085                 }
2086                 break;
2087         case DPSOFTRAST_BLENDMODE_MUL2:
2088                 for (x = startx;x < endx;x++)
2089                 {
2090                         if (!pixelmask[x])
2091                                 continue;
2092                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2093                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2094                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2095                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2096                         pixel[x*4+0] = d[0];
2097                         pixel[x*4+1] = d[1];
2098                         pixel[x*4+2] = d[2];
2099                         pixel[x*4+3] = d[3];
2100                 }
2101                 break;
2102         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2103                 for (x = startx;x < endx;x++)
2104                 {
2105                         if (!pixelmask[x])
2106                                 continue;
2107                         a = in4f[x*4+3] * -255.0f;
2108                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2109                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2110                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2111                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2112                         pixel[x*4+0] = d[0];
2113                         pixel[x*4+1] = d[1];
2114                         pixel[x*4+2] = d[2];
2115                         pixel[x*4+3] = d[3];
2116                 }
2117                 break;
2118         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2119                 for (x = startx;x < endx;x++)
2120                 {
2121                         if (!pixelmask[x])
2122                                 continue;
2123                         a = 255.0f;
2124                         b = 1.0f - in4f[x*4+3];
2125                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2126                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2127                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2128                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2129                         pixel[x*4+0] = d[0];
2130                         pixel[x*4+1] = d[1];
2131                         pixel[x*4+2] = d[2];
2132                         pixel[x*4+3] = d[3];
2133                 }
2134                 break;
2135         case DPSOFTRAST_BLENDMODE_INVADD:
2136                 for (x = startx;x < endx;x++)
2137                 {
2138                         if (!pixelmask[x])
2139                                 continue;
2140                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2141                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2142                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2143                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2144                         pixel[x*4+0] = d[0];
2145                         pixel[x*4+1] = d[1];
2146                         pixel[x*4+2] = d[2];
2147                         pixel[x*4+3] = d[3];
2148                 }
2149                 break;
2150         }
2151 }
2152
2153 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2154 {
2155 #ifdef SSE2_PRESENT
2156         int x;
2157         int startx = span->startx;
2158         int endx = span->endx;
2159         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2160         unsigned char * RESTRICT pixelmask = span->pixelmask;
2161         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2162         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2163         if (!pixel)
2164                 return;
2165         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2166         pixeli += span->y * dpsoftrast.fb_width + span->x;
2167         // handle alphatest now (this affects depth writes too)
2168         if (thread->alphatest)
2169                 for (x = startx;x < endx;x++)
2170                         if (in4ub[x*4+3] < 0.5f)
2171                                 pixelmask[x] = false;
2172         // FIXME: this does not handle bigendian
2173         switch(thread->fb_blendmode)
2174         {
2175         case DPSOFTRAST_BLENDMODE_OPAQUE:
2176                 for (x = startx;x + 4 <= endx;)
2177                 {
2178                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2179                         {
2180                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2181                                 x += 4;
2182                         }
2183                         else
2184                         {
2185                                 if (pixelmask[x])
2186                                         pixeli[x] = ini[x];
2187                                 x++;
2188                         }
2189                 }
2190                 for (;x < endx;x++)
2191                         if (pixelmask[x])
2192                                 pixeli[x] = ini[x];
2193                 break;
2194         case DPSOFTRAST_BLENDMODE_ALPHA:
2195         #define FINISHBLEND(blend2, blend1) \
2196                 for (x = startx;x + 1 < endx;x += 2) \
2197                 { \
2198                         __m128i src, dst; \
2199                         switch (*(const unsigned short*)&pixelmask[x]) \
2200                         { \
2201                         case 0x0101: \
2202                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2203                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2204                                 blend2; \
2205                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2206                                 continue; \
2207                         case 0x0100: \
2208                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2209                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2210                                 blend1; \
2211                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2212                                 continue; \
2213                         case 0x0001: \
2214                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2215                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2216                                 blend1; \
2217                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2218                                 continue; \
2219                         } \
2220                         break; \
2221                 } \
2222                 for(;x < endx; x++) \
2223                 { \
2224                         __m128i src, dst; \
2225                         if (!pixelmask[x]) \
2226                                 continue; \
2227                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2228                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2229                         blend1; \
2230                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2231                 }
2232
2233                 FINISHBLEND({
2234                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2236                 }, {
2237                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2239                 });
2240                 break;
2241         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2242                 FINISHBLEND({
2243                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2244                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2245                 }, {
2246                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2247                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2248                 });
2249                 break;
2250         case DPSOFTRAST_BLENDMODE_ADD:
2251                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2252                 break;
2253         case DPSOFTRAST_BLENDMODE_INVMOD:
2254                 FINISHBLEND({
2255                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2256                 }, {
2257                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2258                 });
2259                 break;
2260         case DPSOFTRAST_BLENDMODE_MUL:
2261                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2262                 break;
2263         case DPSOFTRAST_BLENDMODE_MUL2:
2264                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2265                 break;
2266         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2267                 FINISHBLEND({
2268                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2270                 }, {
2271                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2273                 });
2274                 break;
2275         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2276                 FINISHBLEND({
2277                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2278                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2279                 }, {
2280                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2281                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2282                 });
2283                 break;
2284         case DPSOFTRAST_BLENDMODE_INVADD:
2285                 FINISHBLEND({
2286                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2287                 }, {
2288                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2289                 });
2290                 break;
2291         }
2292 #endif
2293 }
2294
2295 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2296 {
2297         int x;
2298         int startx = span->startx;
2299         int endx = span->endx;
2300         int flags;
2301         float c[4];
2302         float data[4];
2303         float slope[4];
2304         float tc[2], endtc[2];
2305         float tcscale[2];
2306         unsigned int tci[2];
2307         unsigned int tci1[2];
2308         unsigned int tcimin[2];
2309         unsigned int tcimax[2];
2310         int tciwrapmask[2];
2311         int tciwidth;
2312         int filter;
2313         int mip;
2314         const unsigned char * RESTRICT pixelbase;
2315         const unsigned char * RESTRICT pixel[4];
2316         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2317         // if no texture is bound, just fill it with white
2318         if (!texture)
2319         {
2320                 for (x = startx;x < endx;x++)
2321                 {
2322                         out4f[x*4+0] = 1.0f;
2323                         out4f[x*4+1] = 1.0f;
2324                         out4f[x*4+2] = 1.0f;
2325                         out4f[x*4+3] = 1.0f;
2326                 }
2327                 return;
2328         }
2329         mip = triangle->mip[texunitindex];
2330         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2331         // if this mipmap of the texture is 1 pixel, just fill it with that color
2332         if (texture->mipmap[mip][1] == 4)
2333         {
2334                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2335                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2336                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2337                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2338                 for (x = startx;x < endx;x++)
2339                 {
2340                         out4f[x*4+0] = c[0];
2341                         out4f[x*4+1] = c[1];
2342                         out4f[x*4+2] = c[2];
2343                         out4f[x*4+3] = c[3];
2344                 }
2345                 return;
2346         }
2347         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2348         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2349         flags = texture->flags;
2350         tcscale[0] = texture->mipmap[mip][2];
2351         tcscale[1] = texture->mipmap[mip][3];
2352         tciwidth = texture->mipmap[mip][2];
2353         tcimin[0] = 0;
2354         tcimin[1] = 0;
2355         tcimax[0] = texture->mipmap[mip][2]-1;
2356         tcimax[1] = texture->mipmap[mip][3]-1;
2357         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2358         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2359         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2360         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2361         for (x = startx;x < endx;)
2362         {
2363                 unsigned int subtc[2];
2364                 unsigned int substep[2];
2365                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2366                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2367                 if (nextsub >= endx)
2368                 {
2369                         nextsub = endsub = endx-1;      
2370                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2371                 }
2372                 tc[0] = endtc[0];
2373                 tc[1] = endtc[1];
2374                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2375                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2376                 substep[0] = (endtc[0] - tc[0]) * subscale;
2377                 substep[1] = (endtc[1] - tc[1]) * subscale;
2378                 subtc[0] = tc[0] * (1<<16);
2379                 subtc[1] = tc[1] * (1<<16);
2380                 if (filter)
2381                 {
2382                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2383                         {
2384                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2385                                 {
2386                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2387                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2388                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2389                                         tci[0] = subtc[0]>>16;
2390                                         tci[1] = subtc[1]>>16;
2391                                         tci1[0] = tci[0] + 1;
2392                                         tci1[1] = tci[1] + 1;
2393                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2394                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2395                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2396                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2397                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2398                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2399                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2400                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2401                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2402                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2403                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2404                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2405                                         out4f[x*4+0] = c[0];
2406                                         out4f[x*4+1] = c[1];
2407                                         out4f[x*4+2] = c[2];
2408                                         out4f[x*4+3] = c[3];
2409                                 }
2410                         }
2411                         else
2412                         {
2413                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2414                                 {
2415                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2416                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2417                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2418                                         tci[0] = subtc[0]>>16;
2419                                         tci[1] = subtc[1]>>16;
2420                                         tci1[0] = tci[0] + 1;
2421                                         tci1[1] = tci[1] + 1;
2422                                         tci[0] &= tciwrapmask[0];
2423                                         tci[1] &= tciwrapmask[1];
2424                                         tci1[0] &= tciwrapmask[0];
2425                                         tci1[1] &= tciwrapmask[1];
2426                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2427                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2428                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2429                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2430                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2431                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2432                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2433                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2434                                         out4f[x*4+0] = c[0];
2435                                         out4f[x*4+1] = c[1];
2436                                         out4f[x*4+2] = c[2];
2437                                         out4f[x*4+3] = c[3];
2438                                 }
2439                         }
2440                 }
2441                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2442                 {
2443                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2444                         {
2445                                 tci[0] = subtc[0]>>16;
2446                                 tci[1] = subtc[1]>>16;
2447                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2448                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2449                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2450                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2451                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2452                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2453                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2454                                 out4f[x*4+0] = c[0];
2455                                 out4f[x*4+1] = c[1];
2456                                 out4f[x*4+2] = c[2];
2457                                 out4f[x*4+3] = c[3];
2458                         }
2459                 }
2460                 else
2461                 {
2462                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2463                         {
2464                                 tci[0] = subtc[0]>>16;
2465                                 tci[1] = subtc[1]>>16;
2466                                 tci[0] &= tciwrapmask[0];
2467                                 tci[1] &= tciwrapmask[1];
2468                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2469                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2470                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2471                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2472                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2473                                 out4f[x*4+0] = c[0];
2474                                 out4f[x*4+1] = c[1];
2475                                 out4f[x*4+2] = c[2];
2476                                 out4f[x*4+3] = c[3];
2477                         }
2478                 }
2479         }
2480 }
2481
2482 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2483 {
2484 #ifdef SSE2_PRESENT
2485         int x;
2486         int startx = span->startx;
2487         int endx = span->endx;
2488         int flags;
2489         __m128 data, slope, tcscale;
2490         __m128i tcsize, tcmask, tcoffset, tcmax;
2491         __m128 tc, endtc;
2492         __m128i subtc, substep, endsubtc;
2493         int filter;
2494         int mip;
2495         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2496         const unsigned char * RESTRICT pixelbase;
2497         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2498         // if no texture is bound, just fill it with white
2499         if (!texture)
2500         {
2501                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2502                 return;
2503         }
2504         mip = triangle->mip[texunitindex];
2505         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2506         // if this mipmap of the texture is 1 pixel, just fill it with that color
2507         if (texture->mipmap[mip][1] == 4)
2508         {
2509                 unsigned int k = *((const unsigned int *)pixelbase);
2510                 for (x = startx;x < endx;x++)
2511                         outi[x] = k;
2512                 return;
2513         }
2514         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2515         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2516         flags = texture->flags;
2517         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2518         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2519         tcscale = _mm_cvtepi32_ps(tcsize);
2520         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2521         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2522         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2523         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2524         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2525         tcmax = _mm_packs_epi32(tcmask, tcmask);
2526         for (x = startx;x < endx;)
2527         {
2528                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2529                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2530                 if (nextsub >= endx)
2531                 {
2532                         nextsub = endsub = endx-1;
2533                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2534                 }       
2535                 tc = endtc;
2536                 subtc = endsubtc;
2537                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2538                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2539                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2540                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2541                 substep = _mm_slli_epi32(substep, 1);
2542                 if (filter)
2543                 {
2544                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2545                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2546                         {
2547                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2548                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2549                                 {
2550                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2551                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2552                                         tci = _mm_madd_epi16(tci, tcoffset);
2553                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2554                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2555                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2556                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2557                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2558                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2559                                         fracm = _mm_srli_epi16(subtc, 1);
2560                                         pix1 = _mm_add_epi16(pix1,
2561                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2562                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2563                                         pix3 = _mm_add_epi16(pix3,
2564                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2565                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2566                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2567                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2568                                         pix2 = _mm_add_epi16(pix2,
2569                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2570                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2571                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2572                                 }
2573                                 if (x <= endsub)
2574                                 {
2575                                         const unsigned char * RESTRICT ptr1;
2576                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2577                                         tci = _mm_madd_epi16(tci, tcoffset);
2578                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2579                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2580                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2581                                         fracm = _mm_srli_epi16(subtc, 1);
2582                                         pix1 = _mm_add_epi16(pix1,
2583                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2584                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2585                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2586                                         pix1 = _mm_add_epi16(pix1,
2587                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2588                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2589                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2590                                         x++;
2591                                 }
2592                         }
2593                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2594                         {
2595                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2596                                 {
2597                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2598                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2599                                         tci = _mm_madd_epi16(tci, tcoffset);
2600                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2601                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2602                                                                                         _mm_setzero_si128());
2603                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2604                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2605                                                                                         _mm_setzero_si128());
2606                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2607                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2608                                         tci = _mm_madd_epi16(tci, tcoffset);
2609                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2610                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2611                                                                                         _mm_setzero_si128());
2612                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2613                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2614                                                                                         _mm_setzero_si128());
2615                                         fracm = _mm_srli_epi16(subtc, 1);
2616                                         pix1 = _mm_add_epi16(pix1,
2617                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2618                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2619                                         pix3 = _mm_add_epi16(pix3,
2620                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2621                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2622                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2623                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2624                                         pix2 = _mm_add_epi16(pix2,
2625                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2626                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2627                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2628                                 }
2629                                 if (x <= endsub)
2630                                 {
2631                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2632                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2633                                         tci = _mm_madd_epi16(tci, tcoffset);
2634                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2635                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2636                                                                                         _mm_setzero_si128());
2637                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2638                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2639                                                                                         _mm_setzero_si128());
2640                                         fracm = _mm_srli_epi16(subtc, 1);
2641                                         pix1 = _mm_add_epi16(pix1,
2642                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2643                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2644                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2645                                         pix1 = _mm_add_epi16(pix1,
2646                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2647                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2648                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2649                                         x++;
2650                                 }
2651                         }
2652                         else
2653                         {
2654                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2655                                 {
2656                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2657                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2658                                         tci = _mm_madd_epi16(tci, tcoffset);
2659                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2660                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2661                                                                                         _mm_setzero_si128());
2662                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2663                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2664                                                                                         _mm_setzero_si128());
2665                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2666                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2667                                         tci = _mm_madd_epi16(tci, tcoffset);
2668                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2669                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2670                                                                                         _mm_setzero_si128());
2671                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2672                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2673                                                                                         _mm_setzero_si128());
2674                                         fracm = _mm_srli_epi16(subtc, 1);
2675                                         pix1 = _mm_add_epi16(pix1,
2676                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2677                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2678                                         pix3 = _mm_add_epi16(pix3,
2679                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2680                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2681                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2682                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2683                                         pix2 = _mm_add_epi16(pix2,
2684                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2685                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2686                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2687                                 }
2688                                 if (x <= endsub)
2689                                 {
2690                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2691                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2692                                         tci = _mm_madd_epi16(tci, tcoffset);
2693                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2694                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2695                                                                                         _mm_setzero_si128());
2696                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2697                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2698                                                                                         _mm_setzero_si128());
2699                                         fracm = _mm_srli_epi16(subtc, 1);
2700                                         pix1 = _mm_add_epi16(pix1,
2701                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2702                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2703                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2704                                         pix1 = _mm_add_epi16(pix1,
2705                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2706                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2707                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2708                                         x++;
2709                                 }
2710                         }
2711                 }
2712                 else
2713                 {
2714                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2715                         {
2716                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2717                                 {
2718                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2719                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2720                                         tci = _mm_madd_epi16(tci, tcoffset);
2721                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2722                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2723                                 }
2724                                 if (x <= endsub)
2725                                 {
2726                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2727                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2728                                         tci = _mm_madd_epi16(tci, tcoffset);
2729                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2730                                         x++;
2731                                 }
2732                         }
2733                         else
2734                         {
2735                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2736                                 {
2737                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2738                                         tci = _mm_and_si128(tci, tcmax); 
2739                                         tci = _mm_madd_epi16(tci, tcoffset);
2740                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2741                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2742                                 }
2743                                 if (x <= endsub)
2744                                 {
2745                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2746                                         tci = _mm_and_si128(tci, tcmax); 
2747                                         tci = _mm_madd_epi16(tci, tcoffset);
2748                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2749                                         x++;
2750                                 }
2751                         }
2752                 }
2753         }
2754 #endif
2755 }
2756
2757 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2758 {
2759         // TODO: IMPLEMENT
2760         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2761 }
2762
// Shadow map lookup placeholder: the lookup itself is not implemented, so
// the input vector is ignored and the result is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2768
2769 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2770 {
2771         int x;
2772         int startx = span->startx;
2773         int endx = span->endx;
2774         float c[4];
2775         float data[4];
2776         float slope[4];
2777         float z;
2778         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2779         for (x = startx;x < endx;x++)
2780         {
2781                 z = zf[x];
2782                 c[0] = (data[0] + slope[0]*x) * z;
2783                 c[1] = (data[1] + slope[1]*x) * z;
2784                 c[2] = (data[2] + slope[2]*x) * z;
2785                 c[3] = (data[3] + slope[3]*x) * z;
2786                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2787                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2788                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2789                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2790         }
2791 }
2792
2793 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2794 {
2795         int x;
2796         int startx = span->startx;
2797         int endx = span->endx;
2798         float c[4];
2799         float data[4];
2800         float slope[4];
2801         float z;
2802         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2803         for (x = startx;x < endx;x++)
2804         {
2805                 z = zf[x];
2806                 c[0] = (data[0] + slope[0]*x) * z;
2807                 c[1] = (data[1] + slope[1]*x) * z;
2808                 c[2] = (data[2] + slope[2]*x) * z;
2809                 c[3] = (data[3] + slope[3]*x) * z;
2810                 out4f[x*4+0] = c[0];
2811                 out4f[x*4+1] = c[1];
2812                 out4f[x*4+2] = c[2];
2813                 out4f[x*4+3] = c[3];
2814         }
2815 }
2816
2817 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2818 {
2819         int x, startx = span->startx, endx = span->endx;
2820         float c[4], localcolor[4];
2821         localcolor[0] = subcolor[0];
2822         localcolor[1] = subcolor[1];
2823         localcolor[2] = subcolor[2];
2824         localcolor[3] = subcolor[3];
2825         for (x = startx;x < endx;x++)
2826         {
2827                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2828                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2829                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2830                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2831                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2832                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2833                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2834                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2835         }
2836 }
2837
2838 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2839 {
2840         int x, startx = span->startx, endx = span->endx;
2841         for (x = startx;x < endx;x++)
2842         {
2843                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2844                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2845                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2846                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2847         }
2848 }
2849
2850 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2851 {
2852         int x, startx = span->startx, endx = span->endx;
2853         for (x = startx;x < endx;x++)
2854         {
2855                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2856                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2857                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2858                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2859         }
2860 }
2861
2862 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2863 {
2864         int x, startx = span->startx, endx = span->endx;
2865         float a, b;
2866         for (x = startx;x < endx;x++)
2867         {
2868                 a = 1.0f - inb4f[x*4+3];
2869                 b = inb4f[x*4+3];
2870                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2871                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2872                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2873                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2874         }
2875 }
2876
2877 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2878 {
2879         int x, startx = span->startx, endx = span->endx;
2880         float localcolor[4], ilerp, lerp;
2881         localcolor[0] = color[0];
2882         localcolor[1] = color[1];
2883         localcolor[2] = color[2];
2884         localcolor[3] = color[3];
2885         ilerp = 1.0f - localcolor[3];
2886         lerp = localcolor[3];
2887         for (x = startx;x < endx;x++)
2888         {
2889                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2890                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2891                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2892                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2893         }
2894 }
2895
2896
2897
2898 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2899 {
2900 #ifdef SSE2_PRESENT
2901         int x;
2902         int startx = span->startx;
2903         int endx = span->endx;
2904         __m128 data, slope;
2905         __m128 mod, endmod;
2906         __m128i submod, substep, endsubmod;
2907         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2908         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2909         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2910         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2911         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2912         for (x = startx; x < endx;)
2913         {
2914                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2915                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2916                 if (nextsub >= endx)
2917                 {
2918                         nextsub = endsub = endx-1;
2919                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2920                 }
2921                 mod = endmod;
2922                 submod = endsubmod;
2923                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2924                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2925                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2926                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2927                 substep = _mm_packs_epi32(substep, substep);
2928                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2929                 {
2930                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2931                         pix = _mm_mulhi_epu16(pix, submod);
2932                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2933                 }
2934                 if (x <= endsub)
2935                 {
2936                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2937                         pix = _mm_mulhi_epu16(pix, submod);
2938                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2939                         x++;
2940                 }
2941         }
2942 #endif
2943 }
2944
// Writes the interpolated vertex attribute 'arrayindex' as 8-bit colors into
// out4ub (4 bytes per pixel) for pixels span->startx .. span->endx-1.
//
// The attribute base/slope come from DPSOFTRAST_CALCATTRIB; lanes 0 and 2 are
// swapped (presumably RGBA -> BGRA order - confirm against the macro).  The
// value at pixel x is (data + slope*x) * zf[x]; callers in this file pass the
// buffer filled by DPSOFTRAST_Draw_Span_Begin as zf.
//
// The exact value is only evaluated at subspan end points (at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart); in between it is stepped linearly
// in 16-bit fixed point scaled by 4095, and shifted right by 4 on output.
// Without SSE2_PRESENT the function body is empty.
2945 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2946 {
2947 #ifdef SSE2_PRESENT
2948         int x;
2949         int startx = span->startx;
2950         int endx = span->endx;
2951         __m128 data, slope;
2952         __m128 mod, endmod;
2953         __m128i submod, substep, endsubmod;
2954         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2955         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2956         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
             // exact value at the first pixel; it becomes the start of the first subspan
2957         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2958         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2959         for (x = startx; x < endx;)
2960         {
2961                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2962                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2963                 if (nextsub >= endx)
2964                 {
                             // last (possibly shorter) subspan: end exactly on the final pixel
2965                         nextsub = endsub = endx-1;
2966                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2967                 }
2968                 mod = endmod;
2969                 submod = endsubmod;
2970                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
                     // per-pixel fixed-point step across this subspan
2971                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2972                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
                     // low half = value at pixel x, high half = value at pixel x+1
2973                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
                     // the pair loop below advances two pixels per iteration, so the packed
                     // increment has to be two per-pixel steps (it used to be a single step,
                     // which made the gradient advance at half rate inside each subspan)
                     substep = _mm_slli_epi32(substep, 1);
2974                 substep = _mm_packs_epi32(substep, substep);
2975                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2976                 {
2977                         __m128i pix = _mm_srai_epi16(submod, 4);
2978                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2979                 }
2980                 if (x <= endsub)
2981                 {
                             // odd pixel left over: only the low half of submod is used
2982                         __m128i pix = _mm_srai_epi16(submod, 4);
2983                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2984                         x++;
2985                 }
2986         }
2987 #endif
2988 }
2989
// For pixels span->startx .. span->endx-1 computes, per 8-bit channel,
//   out = clamp(ina + inb - subcolor*255, 0, 255)
// subcolor points to 4 floats (loaded unaligned); lanes 0 and 2 are swapped
// before use (presumably RGBA -> BGRA order - confirm with callers).
// 'triangle' is not used.  Without SSE2_PRESENT the function body is empty.
2990 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2991 {
2992 #ifdef SSE2_PRESENT
2993         int x, startx = span->startx, endx = span->endx;
2994         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // duplicate the 4 channels into both 64-bit halves so two pixels are handled at once
2995         localcolor = _mm_packs_epi32(localcolor, localcolor);
2996         for (x = startx;x+2 <= endx;x+=2)
2997         {
2998                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2999                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3000                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
                     // packus clamps the 16-bit results to 0..255
3001                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3002         }
             // trailing single pixel when the span length is odd
3003         if (x < endx)
3004         {
3005                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3006                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3007                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3008                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3009         }
3010 #endif
3011 }
3012
// For pixels span->startx .. span->endx-1 computes, per 8-bit channel,
//   out = (ina * inb) >> 8
// Two pixels are processed per iteration, with a single-pixel tail.
// 'triangle' is not used.  Without SSE2_PRESENT the function body is empty.
3013 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3014 {
3015 #ifdef SSE2_PRESENT
3016         int x, startx = span->startx, endx = span->endx;
3017         for (x = startx;x+2 <= endx;x+=2)
3018         {
                     // pix1 holds the bytes as 0..255, pix2 holds them in the high byte (value << 8),
                     // so the high 16 bits of the product are ina*inb/256
3019                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3020                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3021                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3022                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3023         }
3024         if (x < endx)
3025         {
3026                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3027                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3028                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3029                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3030         }
3031 #endif
3032 }
3033
// For pixels span->startx .. span->endx-1 computes, per 8-bit channel,
//   out = min(ina + inb, 255)
// Two pixels are processed per iteration, with a single-pixel tail.
// 'triangle' is not used.  Without SSE2_PRESENT the function body is empty.
3034 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3035 {
3036 #ifdef SSE2_PRESENT
3037         int x, startx = span->startx, endx = span->endx;
3038         for (x = startx;x+2 <= endx;x+=2)
3039         {
3040                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3041                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
                     // sum in 16 bits, then packus saturates back to 8 bits
3042                 pix1 = _mm_add_epi16(pix1, pix2);
3043                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3044         }
3045         if (x < endx)
3046         {
3047                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3048                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3049                 pix1 = _mm_add_epi16(pix1, pix2);
3050                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3051         }
3052 #endif
3053 }
3054
// For pixels span->startx .. span->endx-1 computes, per 8-bit channel,
//   out = clamp(ina + inb * tint, 0, 255)
// inbtintbgra points to 4 floats (loaded unaligned) that are used in the order
// given, i.e. no channel swap is applied here.
// 'triangle' is not used.  Without SSE2_PRESENT the function body is empty.
3055 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3056 {
3057 #ifdef SSE2_PRESENT
3058         int x, startx = span->startx, endx = span->endx;
             // tint as 8.8 fixed point, duplicated into both 64-bit halves
3059         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3060         tint = _mm_packs_epi32(tint, tint);
3061         for (x = startx;x+2 <= endx;x+=2)
3062         {
3063                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
                     // inb is placed in the high byte (value << 8) so mulhi yields inb*tint
3064                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3065                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3066                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3067         }
3068         if (x < endx)
3069         {
3070                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3071                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3072                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3073                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3074         }
3075 #endif
3076 }
3077
// For pixels span->startx .. span->endx-1 blends inb over ina using inb's own
// fourth channel as the blend factor, per 8-bit channel:
//   out = ina + ((inb - ina) * inb[3]) / 256
// 'triangle' is not used.  Without SSE2_PRESENT the function body is empty.
3078 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3079 {
3080 #ifdef SSE2_PRESENT
3081         int x, startx = span->startx, endx = span->endx;
3082         for (x = startx;x+2 <= endx;x+=2)
3083         {
3084                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
                     // broadcast channel 3 of each of the two pixels to all four of its channels
3086                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
                     // both factors are shifted left by 4 so the signed high product is diff*blend/256
3087                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3088                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3089         }
3090         if (x < endx)
3091         {
3092                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3093                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
                     // single pixel: only the low 64 bits carry data, so one shuffle is enough
3094                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3095                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3096                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3097         }
3098 #endif
3099 }
3100
// For pixels span->startx .. span->endx-1 blends a constant color over in4ub,
// using the color's fourth component as the blend factor, per 8-bit channel:
//   out = in + ((color*255 - in) * (color[3]*255)) / 256
// color points to 4 floats (loaded unaligned); lanes 0 and 2 are swapped
// before use (presumably RGBA -> BGRA order - confirm with callers).
// 'triangle' is not used.  Without SSE2_PRESENT the function body is empty.
3101 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3102 {
3103 #ifdef SSE2_PRESENT
3104         int x, startx = span->startx, endx = span->endx;
3105         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3106         localcolor = _mm_packs_epi32(localcolor, localcolor);
             // blend factor: channel 3 broadcast to every channel, pre-shifted by 4 for the mulhi below
3107         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3108         for (x = startx;x+2 <= endx;x+=2)
3109         {
3110                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3111                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3112                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3113         }
3114         if (x < endx)
3115         {
3116                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3117                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3118                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3119         }
3120 #endif
3121 }
3122
3123
3124
// Vertex stage for the generic shader mode: runs TransformProject on the
// position array with the uniform block starting at
// ModelViewProjectionMatrixM1, then loads the color and texcoord0 arrays.
// texcoord1 is only loaded when the SPECULAR permutation bit is set.
3125 void DPSOFTRAST_VertexShader_Generic(void)
3126 {
3127         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3128         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3129         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3130         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3131                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3132 }
3133
// Pixel stage for the generic shader mode.
// With the DIFFUSE permutation the first texture is sampled and multiplied by
// the interpolated attribute 1; with SPECULAR a second texture is sampled and
// combined with that result by multiply, add or alpha-mix depending on the
// permutation bits.  Without DIFFUSE the interpolated attribute 1 alone is
// written.  The result is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3134 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3135 {
3136         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3137         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3138         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3141         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3142         {
3143                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3144                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3145                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3146                 {
                             // NOTE(review): this samples the second texture with the same attribute
                             // index (2) as the first one although the vertex shader loads TEXCOORD1
                             // for SPECULAR - confirm whether that is intended
3147                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3148                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3149                         {
3150                                 // multiply
3151                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3152                         }
                             // this branch used to test COLORMAPPING a second time, which made it
                             // unreachable; NOTE(review): GLOW is assumed to be the intended selector
                             // for the additive path - confirm against the GLSL generic shader
3153                         else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3154                         {
3155                                 // add
3156                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3157                         }
3158                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3159                         {
3160                                 // alphablend
3161                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3162                         }
3163                 }
3164         }
3165         else
3166                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3167         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3168 }
3169
3170
3171
// Vertex stage for the post-process shader mode: runs TransformProject on the
// position array with the uniform block starting at
// ModelViewProjectionMatrixM1, then loads the texcoord0 and texcoord1 arrays.
3172 void DPSOFTRAST_VertexShader_PostProcess(void)
3173 {
3174         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3175         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3176         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3177 }
3178
// Pixel stage for the post-process shader mode.
// Samples the first texture into the fragment buffer, optionally adds the
// second texture minus the BloomColorSubtract uniform (BLOOM permutation),
// then mixes in the ViewTintColor uniform and finishes the span.
// The SATURATION and GAMMARAMPS permutations are recognised but have no effect.
3179 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3180 {
3181         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3182         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3183         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3184         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3186         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3187         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3188         {
                     // bloom texture is sampled with attribute index 3 and combined in place
3189                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3190                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3191         }
3192         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3193         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3194         {
3195                 // TODO: implement saturation
3196         }
3197         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3198         {
3199                 // TODO: implement gammaramps
3200         }
3201         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3202 }
3203
3204
3205
// Vertex stage for the depth/shadow shader mode: only runs TransformProject on
// the position array with the uniform block starting at
// ModelViewProjectionMatrixM1; no other vertex arrays are touched.
3206 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3207 {
3208         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3209 }
3210
// Pixel stage for the depth/shadow shader mode: emits all-zero color bytes for
// the pixels span->startx .. span->endx-1 and finishes the span.
3211 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3212 {
3213         // this is never called (because colormask is off when this shader is used)
3214         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3215         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3216         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // 4 bytes per pixel, only the covered part of the buffer is cleared
3217         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3218         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3219 }
3220
3221
3222
// Vertex stage for the flat-color shader mode: runs TransformProject on the
// position array (ModelViewProjectionMatrixM1 uniforms) and Transform on the
// texcoord0 array (TexMatrixM1 uniforms).
3223 void DPSOFTRAST_VertexShader_FlatColor(void)
3224 {
3225         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3226         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3227 }
3228
// Pixel stage for the flat-color shader mode: out = texture color * ambient
// color per channel, with the fourth channel scaled by the Alpha uniform.
// When neither alpha test nor blending is active the result is written
// straight into color buffer 0; otherwise it goes to a temporary buffer that
// is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Without SSE2_PRESENT the function body is empty.
3229 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3230 {
3231 #ifdef SSE2_PRESENT
3232         unsigned char * RESTRICT pixelmask = span->pixelmask;
3233         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3234         int x, startx = span->startx, endx = span->endx;
3235         __m128i Color_Ambientm;
3236         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3237         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3240         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test / blending need the Finish pass, so render into the temporary buffer instead
3241         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3242                 pixel = buffer_FragColorbgra8;
             // ambient color as 8.8 fixed point with lanes 0 and 2 swapped; lane 3 is then
             // replaced by the Alpha uniform scaled by 255, and the four values are packed
             // to 16 bits and duplicated into both 64-bit halves
3243         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3244         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3245         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3246         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3247         for (x = startx;x < endx;x++)
3248         {
3249                 __m128i color, pix;
                     // fast path: the next four mask bytes are all 1, so shade four pixels at once
3250                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3251                 {
3252                         __m128i pix2;
3253                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3254                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3255                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3256                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // the loop increment supplies the fourth step
3257                         x += 3;
3258                         continue;
3259                 }
                     // pixels with a zero mask byte are left untouched
3260                 if (!pixelmask[x])
3261                         continue;
3262                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3263                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3264                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3265         }
3266         if (pixel == buffer_FragColorbgra8)
3267                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3268 #endif
3269 }
3270
3271
3272
// Vertex stage for the vertex-color shader mode: runs TransformProject on the
// position array (ModelViewProjectionMatrixM1 uniforms), loads the color array
// and runs Transform on the texcoord0 array (TexMatrixM1 uniforms).
3273 void DPSOFTRAST_VertexShader_VertexColor(void)
3274 {
3275         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3276         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3277         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3278 }
3279
// Pixel stage for the vertex-color shader mode:
//   out = texture color * (diffuse color * interpolated vertex color + ambient color)
// per channel; the fourth channel of the diffuse term is zeroed and the fourth
// channel of the ambient term is the Alpha uniform.
// When neither alpha test nor blending is active the result is written
// straight into color buffer 0; otherwise it goes to a temporary buffer that
// is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Without SSE2_PRESENT the function body is empty.
3280 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3281 {
3282 #ifdef SSE2_PRESENT
3283         unsigned char * RESTRICT pixelmask = span->pixelmask;
3284         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3285         int x, startx = span->startx, endx = span->endx;
3286         __m128i Color_Ambientm, Color_Diffusem;
3287         __m128 data, slope;
3288         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3289         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3290         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3292         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3293         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test / blending need the Finish pass, so render into the temporary buffer instead
3294         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3295                 pixel = buffer_FragColorbgra8;
             // ambient: scaled by 256, lanes 0 and 2 swapped, lane 3 replaced by Alpha*255
3296         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3297         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3298         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3299         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse: scaled by 4096, lanes 0 and 2 swapped, lane 3 cleared
3300         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3301         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3302         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3303         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3304         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3305         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
             // move the attribute to the first pixel and pre-scale it and its step by 4096
3306         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3307         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3308         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // data is advanced by one step in the loop increment, also for skipped pixels
3309         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3310         {
3311                 __m128i color, mod, pix;
                     // fast path: the next four mask bytes are all 1, so shade four pixels at once
3312                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3313                 {
3314                         __m128i pix2, mod2;
3315                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3316                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // data is stepped three times here; the loop increment adds the fourth step
3317                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3318                         data = _mm_add_ps(data, slope);
3319                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3320                         data = _mm_add_ps(data, slope);
3321                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3322                         data = _mm_add_ps(data, slope);
3323                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3324                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3325                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3326                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3327                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3328                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3329                         x += 3;
3330                         continue;
3331                 }
                     // pixels with a zero mask byte are left untouched
3332                 if (!pixelmask[x])
3333                         continue;
3334                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3335                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3336                 mod = _mm_packs_epi32(mod, mod);
3337                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3338                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3339         }
3340         if (pixel == buffer_FragColorbgra8)
3341                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3342 #endif
3343 }
3344
3345
3346
// Vertex stage for the lightmap shader mode: runs TransformProject on the
// position array (ModelViewProjectionMatrixM1 uniforms), runs Transform on the
// texcoord0 array (TexMatrixM1 uniforms) and loads the texcoord4 array.
3347 void DPSOFTRAST_VertexShader_Lightmap(void)
3348 {
3349         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3350         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3351         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3352 }
3353
3354 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3355 {
3356 #ifdef SSE2_PRESENT
3357         unsigned char * RESTRICT pixelmask = span->pixelmask;
3358         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3359         int x, startx = span->startx, endx = span->endx;
3360         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3361         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3362         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3363         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3367         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3368         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3369         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3370                 pixel = buffer_FragColorbgra8;
3371         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3372         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3373         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3374         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3375         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3376         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3377         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3378         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3379         {
3380                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3381                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3384                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3385                 for (x = startx;x < endx;x++)
3386                 {
3387                         __m128i color, lightmap, glow, pix;
3388                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3389                         {
3390                                 __m128i pix2;
3391                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3392                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3393                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3394                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3395                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3396                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3397                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3398                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3399                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3400                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3401                                 x += 3;
3402                                 continue;
3403                         }
3404                         if (!pixelmask[x])
3405                                 continue;
3406                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3407                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3408                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3409                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3410                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3411                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3412                 }
3413         }
3414         else
3415         {
3416                 for (x = startx;x < endx;x++)
3417                 {
3418                         __m128i color, lightmap, pix;
3419                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3420                         {
3421                                 __m128i pix2;
3422                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3423                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3424                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3425                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3426                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3427                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3428                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3429                                 x += 3;
3430                                 continue;
3431                         }
3432                         if (!pixelmask[x]) 
3433                                 continue;
3434                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3435                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3436                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3437                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3438                 }
3439         }
3440         if (pixel == buffer_FragColorbgra8)
3441                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3442 #endif
3443 }
3444
3445
3446
3447 void DPSOFTRAST_VertexShader_FakeLight(void)
3448 {
3449         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3450 }
3451
3452 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3453 {
3454         // TODO: IMPLEMENT
3455         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3456         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3457         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3458         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3459         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3460 }
3461
3462
3463
// vertex shader for the model space light direction map mode
// currently this simply forwards to the plain lightmap vertex shader
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
	return;
}
3468
3469 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3470 {
3471         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3472         // TODO: IMPLEMENT
3473 }
3474
3475
3476
// vertex shader for the tangent space light direction map mode
// currently this simply forwards to the plain lightmap vertex shader
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
	return;
}
3481
3482 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3483 {
3484         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3485         // TODO: IMPLEMENT
3486 }
3487
3488
3489
3490 void DPSOFTRAST_VertexShader_LightDirection(void)
3491 {
3492         int i;
3493         int numvertices = dpsoftrast.numvertices;
3494         float LightDir[4];
3495         float LightVector[4];
3496         float EyePosition[4];
3497         float EyeVectorModelSpace[4];
3498         float EyeVector[4];
3499         float position[4];
3500         float svector[4];
3501         float tvector[4];
3502         float normal[4];
3503         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3504         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3505         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3506         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3507         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3508         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3509         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3510         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3511         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3512         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3513         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3514         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3515         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3516         for (i = 0;i < numvertices;i++)
3517         {
3518                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3519                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3520                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3521                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3522                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3523                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3524                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3525                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3526                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3527                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3528                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3529                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3530                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3531                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3532                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3533                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3534                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3535                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3536                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3537                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3538                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3539                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3540                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3541                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3542                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3543                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3544                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3545                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3546                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3547         }
3548         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3549 }
3550
3551 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3552 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3553 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3554 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3555 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3556 #define DPSOFTRAST_Vector3Normalize(v)\
3557 do\
3558 {\
3559         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3560         if (len)\
3561         {\
3562                 len = 1.0f / len;\
3563                 v[0] *= len;\
3564                 v[1] *= len;\
3565                 v[2] *= len;\
3566         }\
3567 }\
3568 while(0)
3569
3570 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3571 {
3572         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3573         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3574         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3575         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3576         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580         int x, startx = span->startx, endx = span->endx;
3581         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3582         float LightVectordata[4];
3583         float LightVectorslope[4];
3584         float EyeVectordata[4];
3585         float EyeVectorslope[4];
3586         float z;
3587         float diffusetex[4];
3588         float glosstex[4];
3589         float surfacenormal[4];
3590         float lightnormal[4];
3591         float eyenormal[4];
3592         float specularnormal[4];
3593         float diffuse;
3594         float specular;
3595         float SpecularPower;
3596         int d[4];
3597         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3598         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3599         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3600         Color_Glow[3] = 0.0f;
3601         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3602         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3603         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3604         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3605         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3606         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3607         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3608         Color_Pants[3] = 0.0f;
3609         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3610         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3611         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3612         Color_Shirt[3] = 0.0f;
3613         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3614         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3615         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3616         {
3617                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3618                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3619         }
3620         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3621         {
3622                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3623         }
3624         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3625         {
3626                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3627                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3628                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3629                 Color_Diffuse[3] = 0.0f;
3630                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3631                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3632                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3633                 LightColor[3] = 0.0f;
3634                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3635                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3636                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3637                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3638                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3639                 Color_Specular[3] = 0.0f;
3640                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3641                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3642                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3643                 for (x = startx;x < endx;x++)
3644                 {
3645                         z = buffer_z[x];
3646                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3647                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3648                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3649                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3650                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3651                         {
3652                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3653                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3654                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3655                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3656                         }
3657                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3658                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3659                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3660                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3661                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3662                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3663                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3664                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3665
3666                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3667                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3668                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3669                         DPSOFTRAST_Vector3Normalize(lightnormal);
3670
3671                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3672                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3673                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3674                         DPSOFTRAST_Vector3Normalize(eyenormal);
3675
3676                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3677                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3678                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3679                         DPSOFTRAST_Vector3Normalize(specularnormal);
3680
3681                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3682                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3683                         specular = pow(specular, SpecularPower * glosstex[3]);
3684                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3685                         {
3686                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3687                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3688                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3689                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3690                         }
3691                         else
3692                         {
3693                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3694                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3695                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3696                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3697                         }
3698                         buffer_FragColorbgra8[x*4+0] = d[0];
3699                         buffer_FragColorbgra8[x*4+1] = d[1];
3700                         buffer_FragColorbgra8[x*4+2] = d[2];
3701                         buffer_FragColorbgra8[x*4+3] = d[3];
3702                 }
3703         }
3704         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3705         {
3706                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3707                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3708                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3709                 Color_Diffuse[3] = 0.0f;
3710                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3711                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3712                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3713                 LightColor[3] = 0.0f;
3714                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3715                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3716                 for (x = startx;x < endx;x++)
3717                 {
3718                         z = buffer_z[x];
3719                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3720                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3721                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3722                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3723                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3724                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3725                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3726                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3727
3728                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3729                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3730                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3731                         DPSOFTRAST_Vector3Normalize(lightnormal);
3732
3733                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3734                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3735                         {
3736                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3737                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3738                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3739                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3740                         }
3741                         else
3742                         {
3743                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3744                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3745                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3746                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3747                         }
3748                         buffer_FragColorbgra8[x*4+0] = d[0];
3749                         buffer_FragColorbgra8[x*4+1] = d[1];
3750                         buffer_FragColorbgra8[x*4+2] = d[2];
3751                         buffer_FragColorbgra8[x*4+3] = d[3];
3752                 }
3753         }
3754         else
3755         {
3756                 for (x = startx;x < endx;x++)
3757                 {
3758                         z = buffer_z[x];
3759                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3760                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3761                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3762                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3763
3764                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3765                         {
3766                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3767                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3768                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3769                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3770                         }
3771                         else
3772                         {
3773                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3774                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3775                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3776                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3777                         }
3778                         buffer_FragColorbgra8[x*4+0] = d[0];
3779                         buffer_FragColorbgra8[x*4+1] = d[1];
3780                         buffer_FragColorbgra8[x*4+2] = d[2];
3781                         buffer_FragColorbgra8[x*4+3] = d[3];
3782                 }
3783         }
3784         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3785 }
3786
3787
3788
3789 void DPSOFTRAST_VertexShader_LightSource(void)
3790 {
3791         int i;
3792         int numvertices = dpsoftrast.numvertices;
3793         float LightPosition[4];
3794         float LightVector[4];
3795         float LightVectorModelSpace[4];
3796         float EyePosition[4];
3797         float EyeVectorModelSpace[4];
3798         float EyeVector[4];
3799         float position[4];
3800         float svector[4];
3801         float tvector[4];
3802         float normal[4];
3803         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3804         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3805         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3806         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3807         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3808         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3809         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3810         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3811         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3812         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3813         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3814         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3815         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3816         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3817         for (i = 0;i < numvertices;i++)
3818         {
3819                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3820                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3821                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3822                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3823                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3824                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3825                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3826                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3827                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3828                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3829                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3830                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3831                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3832                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3833                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3834                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3835                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3836                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3837                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3838                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3839                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3840                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3841                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3842                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3843                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3844                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3845                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3846                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3847                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3848                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3849                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3850                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3851         }
3852         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3853         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3854 }
3855
3856 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3857 {
3858 #ifdef SSE2_PRESENT
3859         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3860         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3861         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3862         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3863         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3864         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3865         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3866         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3867         int x, startx = span->startx, endx = span->endx;
3868         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3869         float CubeVectordata[4];
3870         float CubeVectorslope[4];
3871         float LightVectordata[4];
3872         float LightVectorslope[4];
3873         float EyeVectordata[4];
3874         float EyeVectorslope[4];
3875         float z;
3876         float diffusetex[4];
3877         float glosstex[4];
3878         float surfacenormal[4];
3879         float lightnormal[4];
3880         float eyenormal[4];
3881         float specularnormal[4];
3882         float diffuse;
3883         float specular;
3884         float SpecularPower;
3885         float CubeVector[4];
3886         float attenuation;
3887         int d[4];
3888         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3889         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3890         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3891         Color_Glow[3] = 0.0f;
3892         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3893         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3894         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3895         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3896         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3897         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3898         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3899         Color_Diffuse[3] = 0.0f;
3900         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3901         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3902         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3903         Color_Specular[3] = 0.0f;
3904         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3905         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3906         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3907         Color_Pants[3] = 0.0f;
3908         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3909         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3910         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3911         Color_Shirt[3] = 0.0f;
3912         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3913         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3914         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3915         LightColor[3] = 0.0f;
3916         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3917         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3918         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3919         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3920         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3921         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3922         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3923         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3924         {
3925                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3926                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3927         }
3928         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3929                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3930         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3931         {
3932                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3933                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3934                 for (x = startx;x < endx;x++)
3935                 {
3936                         z = buffer_z[x];
3937                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3938                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3939                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3940                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3941                         if (attenuation < 0.01f)
3942                                 continue;
3943                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3944                         {
3945                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3946                                 if (attenuation < 0.01f)
3947                                         continue;
3948                         }
3949
3950                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3951                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3952                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3953                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3954                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3955                         {
3956                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3957                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3958                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3959                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3960                         }
3961                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3962                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3963                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3964                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3965                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3966                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3967                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3968                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3969
3970                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3971                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3972                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3973                         DPSOFTRAST_Vector3Normalize(lightnormal);
3974
3975                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3976                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3977                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3978                         DPSOFTRAST_Vector3Normalize(eyenormal);
3979
3980                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3981                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3982                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3983                         DPSOFTRAST_Vector3Normalize(specularnormal);
3984
3985                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3986                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3987                         specular = pow(specular, SpecularPower * glosstex[3]);
3988                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3989                         {
3990                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3991                                 attenuation *= (1.0f / 255.0f);
3992                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3993                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3994                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3995                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3996                         }
3997                         else
3998                         {
3999                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4000                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4001                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4002                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4003                         }
4004                         buffer_FragColorbgra8[x*4+0] = d[0];
4005                         buffer_FragColorbgra8[x*4+1] = d[1];
4006                         buffer_FragColorbgra8[x*4+2] = d[2];
4007                         buffer_FragColorbgra8[x*4+3] = d[3];
4008                 }
4009         }
4010         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4011         {
4012                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4013                 for (x = startx;x < endx;x++)
4014                 {
4015                         z = buffer_z[x];
4016                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4017                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4018                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4019                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4020                         if (attenuation < 0.01f)
4021                                 continue;
4022                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4023                         {
4024                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4025                                 if (attenuation < 0.01f)
4026                                         continue;
4027                         }
4028
4029                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4030                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4031                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4032                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4033                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4034                         {
4035                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4036                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4037                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4038                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4039                         }
4040                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4041                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4042                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4043                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4044
4045                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4046                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4047                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4048                         DPSOFTRAST_Vector3Normalize(lightnormal);
4049
4050                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4051                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4052                         {
4053                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4054                                 attenuation *= (1.0f / 255.0f);
4055                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4056                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4057                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4058                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4059                         }
4060                         else
4061                         {
4062                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4063                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4064                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4065                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4066                         }
4067                         buffer_FragColorbgra8[x*4+0] = d[0];
4068                         buffer_FragColorbgra8[x*4+1] = d[1];
4069                         buffer_FragColorbgra8[x*4+2] = d[2];
4070                         buffer_FragColorbgra8[x*4+3] = d[3];
4071                 }
4072         }
4073         else
4074         {
4075                 for (x = startx;x < endx;x++)
4076                 {
4077                         z = buffer_z[x];
4078                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4079                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4080                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4081                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4082                         if (attenuation < 0.01f)
4083                                 continue;
4084                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4085                         {
4086                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4087                                 if (attenuation < 0.01f)
4088                                         continue;
4089                         }
4090
4091                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4092                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4093                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4094                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4095                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4096                         {
4097                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4098                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4099                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4100                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4101                         }
4102                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4103                         {
4104                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4105                                 attenuation *= (1.0f / 255.0f);
4106                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4107                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4108                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4109                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4110                         }
4111                         else
4112                         {
4113                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4114                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4115                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4116                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4117                         }
4118                         buffer_FragColorbgra8[x*4+0] = d[0];
4119                         buffer_FragColorbgra8[x*4+1] = d[1];
4120                         buffer_FragColorbgra8[x*4+2] = d[2];
4121                         buffer_FragColorbgra8[x*4+3] = d[3];
4122                 }
4123         }
4124         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4125 #endif
4126 }
4127
4128
4129
4130 void DPSOFTRAST_VertexShader_Refraction(void)
4131 {
4132         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4133 }
4134
4135 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4136 {
4137         // TODO: IMPLEMENT
4138         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4139         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4140         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4141         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4142         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4143 }
4144
4145
4146
4147 void DPSOFTRAST_VertexShader_Water(void)
4148 {
4149         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4150 }
4151
4152
4153 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4154 {
4155         // TODO: IMPLEMENT
4156         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4157         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4159         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4160         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4161 }
4162
4163
4164
4165 void DPSOFTRAST_VertexShader_ShowDepth(void)
4166 {
4167         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4168 }
4169
4170 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4171 {
4172         // TODO: IMPLEMENT
4173         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4174         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4175         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4176         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4177         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4178 }
4179
4180
4181
4182 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4183 {
4184         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4185 }
4186
4187 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4188 {
4189         // TODO: IMPLEMENT
4190         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4191         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4192         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4193         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4194         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4195 }
4196
4197
4198
4199 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4200 {
4201         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4202 }
4203
4204 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4205 {
4206         // TODO: IMPLEMENT
4207         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4208         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4209         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4210         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4211         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4212 }
4213
4214
4215
4216 typedef struct DPSOFTRAST_ShaderModeInfo_s
4217 {
4218         int lodarrayindex;
4219         void (*Vertex)(void);
4220         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4221         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4222         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4223 }
4224 DPSOFTRAST_ShaderModeInfo;
4225
4226 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4227 {
4228         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4229         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4230         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4231         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4232         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4233         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4234         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4235         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4236         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4237         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4238         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4239         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4240         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4241         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4242         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4243         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4244 };
4245
// Flushes the span queue of one rasterizer thread.
// For each of the thread->numspans entries in thread->spans this either
//  - depth-tests the span against dpsoftrast.fb_depthpixels (when depth testing is
//    enabled and a depth buffer exists), runs the current shader mode's Span
//    callback on the surviving range, and then writes depth if depthmask is set, or
//  - just runs the Span callback with every pixel of the span enabled.
// The Span callback is skipped when there is no color buffer 0 or the color mask
// is zero. thread->numspans is reset to 0 before returning.
// pixelmask is a single stack buffer reused for every span; span->pixelmask points
// into it only for the duration of the callback.
4246 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4247 {
4248         int i;
4249         int x;
4250         int startx;
4251         int endx;
4252 //      unsigned int c;
4253 //      unsigned int *colorpixel;
4254         unsigned int *depthpixel;
4255         float w;
4256         float wslope;
4257         int depth;
4258         int depthslope;
4259         unsigned int d;
4260         DPSOFTRAST_State_Triangle *triangle;
4261         DPSOFTRAST_State_Span *span;
4262         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4263         for (i = 0; i < thread->numspans; i++)
4264         {
4265                 span = &thread->spans[i];
4266                 triangle = &thread->triangles[span->triangle];
4267                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4268                 {
                         // triangle->w holds a plane equation: w[0] is the change per x,
                         // w[1] the change per y and w[2] the constant term; evaluate it
                         // at the span origin (span->x, span->y)
4269                         wslope = triangle->w[0];
4270                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
                         // convert to integer depth units; the polygon offset is folded
                         // into the starting depth only, not into the per-pixel slope
4271                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4272                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
                         // depthpixel points at the span origin, so x below (and the
                         // pixelmask index) is relative to span->x
4273                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4274                         startx = span->startx;
4275                         endx = span->endx;
                         // fill pixelmask[startx..endx) with the result of the depth
                         // comparison; d is the interpolated depth converted to unsigned,
                         // and an unrecognised depth func behaves like GL_ALWAYS
4276                         switch(thread->fb_depthfunc)
4277                         {
4278                         default:
4279                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4280                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4281                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4282                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4283                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4284                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4285                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4286                         }
4287                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4288                         //for (x = startx;x < endx;x++)
4289                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4290                         // trim pixels that failed the depth test from both ends of the span
4291                         while (startx < endx && !pixelmask[startx])
4292                                 startx++;
4293                         while (endx > startx && !pixelmask[endx-1])
4294                                 endx--;
4295                         if (startx >= endx)
4296                                 continue; // no pixels to fill
                         // publish the mask and the trimmed range to the shader
4297                         span->pixelmask = pixelmask;
4298                         span->startx = startx;
4299                         span->endx = endx;
4300                         // run pixel shader if appropriate
4301                         // do this before running depthmask code, to allow the pixelshader
4302                         // to clear pixelmask values for alpha testing
4303                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4304                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
                         // write depth only for pixels whose mask entry is still set
4305                         if (thread->depthmask)
4306                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4307                                         if (pixelmask[x])
4308                                                 depthpixel[x] = d;
4309                 }
4310                 else
4311                 {
4312                         // no depth testing means we're just dealing with color...
4313                         // if there is no color buffer, skip pixel shader
4314                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4315                         {
                                 // enable every pixel in [span->startx, span->endx)
4316                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4317                                 span->pixelmask = pixelmask;
4318                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4319                         }
4320                 }
4321         }
         // the queue is now empty
4322         thread->numspans = 0;
4323 }
4324
// Declares the Draw command and its payload fields through the DEFCOMMAND macro
// (defined outside this part of the file). DPSOFTRAST_Interpret_Draw below reads
// these fields through a DPSOFTRAST_Command_Draw pointer, so the macro presumably
// generates that struct; 22 is presumably the command's opcode -- confirm both
// against the DEFCOMMAND definition.
// refcount is decremented by DPSOFTRAST_Interpret_Draw each time it finishes with
// the command; arrays points at the packed float4 vertex data and element3i /
// element3s are the optional 32-bit / 16-bit index lists.
4325 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4326
// Rasterizes one Draw command on this thread. The body is compiled only when
// SSE2_PRESENT is defined; otherwise the function does nothing.
//
// thread  - per-thread rasterizer state: supplies scissor, viewport, cull face,
//           the two y bands (miny1..maxy1 and miny2..maxy2) handled by this
//           thread, and the triangle/span buffers that are filled here.
// command - the Draw command: vertex range, triangle count, optional index lists
//           (element3s is used if set, else element3i, else vertices are taken
//           three at a time), the packed float4 vertex arrays and a refcount.
//
// Per triangle: near-plane clip (only when command->clipped is nonzero), face
// culling, bounding-box clamp against the scissor rectangle and the bands,
// plane-equation setup for w and for every attribute array listed for the
// current shader mode, mip level selection per texture unit, then edge walking
// that appends spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels to
// thread->spans. The buffers are flushed through DPSOFTRAST_Draw_ProcessSpans
// whenever they fill and once more before returning if anything is pending.
//
// command->refcount is decremented once on each return path; when it reaches
// zero and command->commandsize is no larger than the aligned bare struct,
// command->arrays is released with MM_FREE (presumably that size means the
// vertex data was allocated separately rather than stored inline after the
// command -- confirm against DPSOFTRAST_Draw_AllocateDrawCommand).
4327 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4328 {
4329 #ifdef SSE2_PRESENT
4330         int cullface = thread->cullface;
4331         int minx, maxx, miny, maxy;
4332         int miny1, maxy1, miny2, maxy2;
4333         __m128i fbmin, fbmax;
4334         __m128 viewportcenter, viewportscale;
4335         int firstvertex = command->firstvertex;
4336         int numvertices = command->numvertices;
4337         int numtriangles = command->numtriangles;
4338         const int *element3i = command->element3i;
4339         const unsigned short *element3s = command->element3s;
4340         int clipped = command->clipped;
4341         int i;
4342         int j;
4343         int k;
4344         int y;
4345         int e[3];
4346         __m128i screeny;
4347         int starty, endy, bandy;
4348         int numpoints;
4349         int clipcase;
4350         float clipdist[4];
4351         __m128 triangleedge1, triangleedge2, trianglenormal;
4352         __m128 clipfrac[3];
4353         __m128 screen[4];
4354         DPSOFTRAST_State_Triangle *triangle;
4355         DPSOFTRAST_Texture *texture;
4356         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
         // clamp both of this thread's y bands to the scissor rectangle
4357         miny = thread->fb_scissor[1];
4358         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4359         miny1 = bound(miny, thread->miny1, maxy);
4360         maxy1 = bound(miny, thread->maxy1, maxy);
4361         miny2 = bound(miny, thread->miny2, maxy);
4362         maxy2 = bound(miny, thread->maxy2, maxy);
         // early out: the command's y range [starty, endy) touches neither band,
         // so only the shared refcount needs to be released
4363         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4364         {
4365                 if (!ATOMIC_DECREMENT(command->refcount))
4366                 {
4367                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4368                                 MM_FREE(command->arrays);
4369                 }
4370                 return;
4371         }
4372         minx = thread->fb_scissor[0];
4373         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
         // clamp limits as packed 16-bit (x, y) pairs: fbmin starts at the top of
         // band 1 and fbmax (inclusive, hence the -1) ends at the bottom of band 2
4374         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4375         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4376         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4377         viewportscale = _mm_load_ps(thread->fb_viewportscale);
         // give the optional fourth point and the clip fractions defined values
4378         screen[3] = _mm_setzero_ps();
4379         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4380         for (i = 0;i < numtriangles;i++)
4381         {
                 // command->arrays starts with numvertices float4 screen coordinates;
                 // 'arrays' initially points at the float4 array that follows it, which
                 // is read below for the clip distance (z + w) and for re-projecting
                 // clipped vertices (presumably the clip-space position -- confirm)
4382                 const float *screencoord4f = command->arrays;
4383                 const float *arrays = screencoord4f + numvertices*4;
4384
4385                 // generate the 3 edges of this triangle
4386                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4387                 if (element3s)
4388                 {
4389                         e[0] = element3s[i*3+0] - firstvertex;
4390                         e[1] = element3s[i*3+1] - firstvertex;
4391                         e[2] = element3s[i*3+2] - firstvertex;
4392                 }
4393                 else if (element3i)
4394                 {
4395                         e[0] = element3i[i*3+0] - firstvertex;
4396                         e[1] = element3i[i*3+1] - firstvertex;
4397                         e[2] = element3i[i*3+2] - firstvertex;
4398                 }
4399                 else
4400                 {
4401                         e[0] = i*3+0;
4402                         e[1] = i*3+1;
4403                         e[2] = i*3+2;
4404                 }
4405
                 // SKIPBACKFACE: computes both edges relative to screen[1] and the z
                 // component of their cross product (edge1.x*edge2.y - edge1.y*edge2.x)
                 // in lane 0 of trianglenormal, then skips the triangle according to
                 // cullface. It expands to 'continue', so it may only be used directly
                 // inside the triangle loop. The edges and normal are reused later for
                 // the attribute slopes.
4406 #define SKIPBACKFACE \
4407                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4408                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4409                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4410                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4411                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4412                 switch(cullface) \
4413                 { \
4414                 case GL_BACK: \
4415                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4416                                 continue; \
4417                         break; \
4418                 case GL_FRONT: \
4419                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4420                                 continue; \
4421                         break; \
4422                 }
4423
                 // CLIPPEDVERTEXLERP: polygon point k is where edge p1->p2 crosses the
                 // near plane; the crossing fraction is kept in clipfrac[p1] so that
                 // GENATTRIBLERP can reuse it, and the interpolated vertex is projected
                 // to screen space with DPSOFTRAST_PROJECTVERTEX.
4424 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4425                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4426                         { \
4427                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4428                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4429                         }
                 // CLIPPEDVERTEXCOPY: polygon point k is vertex p1 unchanged, so its
                 // already-projected screen coordinate is loaded directly.
4430 #define CLIPPEDVERTEXCOPY(k,p1) \
4431                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4432
                 // GENATTRIBCOPY / GENATTRIBLERP: the attribute-array counterparts of
                 // the two macros above, reading from whatever 'arrays' currently
                 // points at and reusing the stored clipfrac[p1].
4433 #define GENATTRIBCOPY(attrib, p1) \
4434                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4435 #define GENATTRIBLERP(attrib, p1, p2) \
4436                 { \
4437                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4438                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4439                 }
                 // GENATTRIBS: produces the attribute values at polygon points 0, 1
                 // and 2 for the given clipcase, mirroring how screen[0..2] were built
                 // in the matching branch below. Only three points are generated even
                 // for four-point polygons.
4440 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4441                 switch(clipcase) \
4442                 { \
4443                 default: \
4444                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4445                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4446                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4447                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4448                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4449                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4450                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4451                 }
4452
                 // when the command is not flagged as clipped, skip the distance test
                 // and jump straight into the "all three vertices in front" case
4453                 if (! clipped)
4454                         goto notclipped;
4455
4456                 // calculate distance from nearplane
4457                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4458                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4459                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                 // classify by which vertices have a non-negative distance; each
                 // branch builds the 3 or 4 point polygon in screen[] and records the
                 // clipcase that GENATTRIBS uses later
4460                 if (clipdist[0] >= 0.0f)
4461                 {
4462                         if (clipdist[1] >= 0.0f)
4463                         {
4464                                 if (clipdist[2] >= 0.0f)
4465                                 {
4466                                 notclipped:
4467                                         // triangle is entirely in front of nearplane
4468                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4469                                         SKIPBACKFACE;
4470                                         numpoints = 3;
4471                                         clipcase = 0;
4472                                 }
4473                                 else
4474                                 {
4475                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4476                                         SKIPBACKFACE;
4477                                         numpoints = 4;
4478                                         clipcase = 1;
4479                                 }
4480                         }
4481                         else
4482                         {
4483                                 if (clipdist[2] >= 0.0f)
4484                                 {
4485                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4486                                         SKIPBACKFACE;
4487                                         numpoints = 4;
4488                                         clipcase = 2;
4489                                 }
4490                                 else
4491                                 {
4492                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4493                                         SKIPBACKFACE;
4494                                         numpoints = 3;
4495                                         clipcase = 3;
4496                                 }
4497                         }
4498                 }
4499                 else if (clipdist[1] >= 0.0f)
4500                 {
4501                         if (clipdist[2] >= 0.0f)
4502                         {
4503                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4504                                 SKIPBACKFACE;
4505                                 numpoints = 4;
4506                                 clipcase = 4;
4507                         }
4508                         else
4509                         {
4510                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4511                                 SKIPBACKFACE;
4512                                 numpoints = 3;
4513                                 clipcase = 5;
4514                         }
4515                 }
4516                 else if (clipdist[2] >= 0.0f)
4517                 {
4518                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4519                         SKIPBACKFACE;
4520                         numpoints = 3;
4521                         clipcase = 6;
4522                 }
4523                 else continue; // triangle is entirely behind nearplane
4524
4525                 {
4526                         // calculate integer y coords for triangle points
                         // screeni packs the truncated x,y of all four points as 16-bit
                         // values (x in the low half, y in the high half of each 32-bit
                         // lane); a three-point polygon repeats screen[2] as the fourth.
                         // The shuffles below reduce the four points to a single min and
                         // max (x, y) pair, which is then clamped to fbmin/fbmax.
4527                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4528                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4529                                         screenmin = _mm_min_epi16(screeni, screenir),
4530                                         screenmax = _mm_max_epi16(screeni, screenir);
4531                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4532                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4533                         screenmin = _mm_max_epi16(screenmin, fbmin);
4534                         screenmax = _mm_min_epi16(screenmax, fbmax);
4535                         // skip offscreen triangles
4536                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4537                                 continue;
                         // 16-bit lane 1 is y; endy is exclusive
4538                         starty = _mm_extract_epi16(screenmin, 1);
4539                         endy = _mm_extract_epi16(screenmax, 1)+1;
                         // the clamped box lies entirely in the gap between the two bands
4540                         if (starty >= maxy1 && endy <= miny2)
4541                                 continue;
                         // arithmetic shift leaves the sign-extended y of each point in
                         // its own 32-bit lane, for the per-scanline compares below
4542                         screeny = _mm_srai_epi32(screeni, 16);
4543                 }
4544
                 // the triangle is written into the next free slot; numtriangles is
                 // only advanced after its spans have been generated
4545                 triangle = &thread->triangles[thread->numtriangles];
4546
4547                 // calculate attribute plans for triangle data...
4548                 // okay, this triangle is going to produce spans, we'd better project
4549                 // the interpolants now (this is what gives perspective texturing),
4550                 // this consists of simply multiplying all arrays by the W coord
4551                 // (which is basically 1/Z), which will be undone per-pixel
4552                 // (multiplying by Z again) to get the perspective-correct array
4553                 // values
4554                 {
4555                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4556                         __m128 mipedgescale, mipdensity;
                         // (edge1.x, edge1.y, edge2.x, edge2.y) divided by the cross
                         // product z from SKIPBACKFACE, then each lane broadcast into its
                         // own vector for use as plane-equation coefficients
4557                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4558                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4559                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4560                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4561                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
                         // broadcast the w component of the first three screen points
4562                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4563                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4564                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                         // plane equation for w itself (scalar lane only): slope per x,
                         // slope per y, and the value at screen origin, anchored at point 1
4565                         attribedge1 = _mm_sub_ss(w0, w1);
4566                         attribedge2 = _mm_sub_ss(w2, w1);
4567                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4568                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4569                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4570                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4571                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4572                         _mm_store_ss(&triangle->w[0], attribxslope);
4573                         _mm_store_ss(&triangle->w[1], attribyslope);
4574                         _mm_store_ss(&triangle->w[2], attriborigin);
4575                         mipedgescale = _mm_setzero_ps();
                         // walk the shader mode's list of attribute arrays until its
                         // terminator (any value >= DPSOFTRAST_ARRAY_TOTAL); 'arrays' is
                         // advanced by one float4 array per entry, so the data in
                         // command->arrays has to be packed in this same order
4576                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4577                         {
4578                                 __m128 attrib0, attrib1, attrib2;
4579                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4580                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4581                                         break;
4582                                 arrays += numvertices*4;
4583                                 GENATTRIBS(attrib0, attrib1, attrib2);
                                 // same plane-equation construction as for w above, applied
                                 // to all four components of (attribute * w)
4584                                 attriborigin = _mm_mul_ps(attrib1, w1);
4585                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4586                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4587                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4588                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4589                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4590                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4591                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4592                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
                                 // for the array that drives mip selection, record the change
                                 // of its first two components along each triangle edge,
                                 // scaled by the approximate reciprocal of that edge's
                                 // screen-space length (lanes 0-1: edge 1, lanes 2-3: edge 2)
4593                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4594                                 {
4595                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4596                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4597                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4598                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4599                                 }
4600                         }
4601
                         // choose a mip level for every texture unit the shader mode
                         // lists (terminated by a value >= DPSOFTRAST_MAXTEXTUREUNITS);
                         // units that are not handled below keep level 0
4602                         memset(triangle->mip, 0, sizeof(triangle->mip));
4603                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4604                         {
4605                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4606                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4607                                         break;
4608                                 texture = thread->texbound[texunit];
                                 // NOTE(review): filters above LINEAR are presumably the
                                 // mipmapped ones -- confirm against the filter enum
4609                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4610                                 {
                                         // scale by mipmap[0][2] and mipmap[0][3] (presumably the
                                         // width and height of mip 0 -- confirm), square and sum
                                         // per edge, then keep the smaller of the two edges
4611                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4612                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4613                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4614                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4615                                         // this will be multiplied in the texturing routine by the texture resolution
                                         // y is only a scratch variable here; the scanline loop
                                         // below reinitialises it
4616                                         y = _mm_cvtss_si32(mipdensity);
4617                                         if (y > 0)
4618                                         {
                                                 // half the base-2 logarithm of the squared density,
                                                 // clamped to the last available mip level
4619                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4620                                                 if (y > texture->mipmaps - 1)
4621                                                         y = texture->mipmaps - 1;
4622                                                 triangle->mip[texunit] = y;
4623                                         }
4624                                 }
4625                         }
4626                 }
4627         
                 // the outer loop visits band 1 and then band 2: its first pass stops
                 // at min(endy, maxy1); the increment then raises the limit to
                 // min(endy, maxy2) and skips y forward to miny2. The inner loop
                 // handles one run of scanlines between vertex rows per iteration.
4628                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4629                 for (; y < bandy;)
4630                 {
4631                         __m128 xcoords, xslope;
                         // one 32-bit lane per polygon point, all ones where y is greater
                         // than that point's integer y; movemask turns each lane into a
                         // hex digit of yccmask (point 0 in the lowest digit)
4632                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4633                         int yccmask = _mm_movemask_epi8(ycc);
4634                         int edge0p, edge0n, edge1p, edge1n;
4635                         int nexty;
                         // pick the two polygon edges (edgeXp -> edgeXn) to walk for this
                         // mask. 0xFFFF means y is greater than every point's y, so the
                         // polygon is finished; 0x0000 means y is greater than none, so
                         // move down one row.
4636                         if (numpoints == 4)
4637                         {
4638                                 switch(yccmask)
4639                                 {
4640                                 default:
4641                                 case 0xFFFF: /*0000*/ y = endy; continue;
4642                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4643                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4644                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4645                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4646                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4647                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4648                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4649                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4650                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4651                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4652                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4653                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4654                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4655                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4656                                 case 0x0000: /*1111*/ y++; continue;
4657                                 }
4658                         }
4659                         else
4660                         {
                                 // three points: lanes 2 and 3 both hold point 2, so the two
                                 // upper hex digits always agree
4661                                 switch(yccmask)
4662                                 {
4663                                 default:
4664                                 case 0xFFFF: /*000*/ y = endy; continue;
4665                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4666                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4667                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4668                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4669                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4670                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4671                                 case 0x0000: /*111*/ y++; continue;
4672                                 }
4673                         }
                         // nexty = smallest point y that y has not passed yet: lanes whose
                         // compare was true become 0x7FFF so they cannot win the minimum;
                         // the result is limited to the last row of the current band
4674                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4675                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4676                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4677                         nexty = _mm_extract_epi16(ycc, 0);
4678                         if (nexty >= bandy) nexty = bandy-1;
                         // dx/dy of both edges (lane 0: edge 0, lane 2: edge 1), then the
                         // x of each edge at row y, plus 0.5 ahead of the float-to-int
                         // conversion below
4679                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4680                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4681                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4682                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4683                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                         // make lane 0 the left edge and lane 2 the right edge
4684                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4685                         {
4686                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4687                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4688                         }
                         // step down the rows up to nexty, advancing both edges by their slope
4689                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4690                         {
4691                                 int startx, endx, offset;
4692                                 startx = _mm_cvtss_si32(xcoords);
4693                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                 // when the row starts left of the scissor, clamp to 0 and
                                 // skip whole span-length chunks that end at or before minx;
                                 // the leftover pixels are excluded through span->startx
                                 // below (the mask arithmetic assumes
                                 // DPSOFTRAST_DRAW_MAXSPANLENGTH is a power of two)
4694                                 if (startx < minx) 
4695                                 {
4696                                         if (startx < 0) startx = 0;
4697                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4698                                 }
4699                                 if (endx > maxx) endx = maxx;
4700                                 if (startx >= endx) continue;
                                 // cut the row into chunks of at most MAXSPANLENGTH pixels;
                                 // span->x is the chunk origin and startx/endx are relative
                                 // to it
4701                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4702                                 {
4703                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4704                                         span->triangle = thread->numtriangles;
4705                                         span->x = offset;
4706                                         span->y = y;
4707                                         span->startx = max(minx - offset, 0);
4708                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
                                         // an empty chunk is not committed: numspans is left
                                         // unchanged so the slot gets reused
4709                                         if (span->startx >= span->endx)
4710                                                 continue; 
                                         // flush as soon as the span buffer is full
4711                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4712                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4713                                 }
4714                         }
4715                 }
4716
                 // commit the triangle; when the triangle buffer is full, flush the
                 // pending spans so the slots can be reused from index 0
4717                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4718                 {
4719                         DPSOFTRAST_Draw_ProcessSpans(thread);
4720                         thread->numtriangles = 0;
4721                 }
4722         }
4723
         // this thread is done reading the command's vertex data; free it when the
         // refcount reaches zero (same size condition as the early-out path above)
4724         if (!ATOMIC_DECREMENT(command->refcount))
4725         {
4726                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4727                         MM_FREE(command->arrays);
4728         }
4729
         // flush whatever is still queued so nothing carries over past this command
4730         if (thread->numspans > 0 || thread->numtriangles > 0)
4731         {
4732                 DPSOFTRAST_Draw_ProcessSpans(thread);
4733                 thread->numtriangles = 0;
4734         }
4735 #endif
4736 }
4737
4738 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4739 {
4740         int i;
4741         int j;
4742         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4743         int datasize = 2*numvertices*sizeof(float[4]);
4744         DPSOFTRAST_Command_Draw *command;
4745         unsigned char *data;
4746         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4747         {
4748                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4749                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4750                         break;
4751                 datasize += numvertices*sizeof(float[4]);
4752         }
4753         if (element3s)
4754                 datasize += numtriangles*sizeof(unsigned short[3]);
4755         else if (element3i)
4756                 datasize += numtriangles*sizeof(int[3]);
4757         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4758         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4759         {
4760                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4761                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4762         }
4763         else
4764         {
4765                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4766                 data = (unsigned char *)command + commandsize;
4767         }
4768         command->firstvertex = firstvertex;
4769         command->numvertices = numvertices;
4770         command->numtriangles = numtriangles;
4771         command->arrays = (float *)data;
4772         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4773         dpsoftrast.firstvertex = firstvertex;
4774         dpsoftrast.numvertices = numvertices;
4775         dpsoftrast.screencoord4f = (float *)data;
4776         data += numvertices*sizeof(float[4]);
4777         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4778         data += numvertices*sizeof(float[4]);
4779         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4780         {
4781                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4782                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4783                         break;
4784                 dpsoftrast.post_array4f[j] = (float *)data;
4785                 data += numvertices*sizeof(float[4]);
4786         }
4787         command->element3i = NULL;
4788         command->element3s = NULL;
4789         if (element3s)
4790         {
4791                 command->element3s = (unsigned short *)data;
4792                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4793         }
4794         else if (element3i)
4795         {
4796                 command->element3i = (int *)data;
4797                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4798         }
4799         return command;
4800 }
4801
4802 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4803 {
4804         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4805         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4806         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4807         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4808         if (command->starty >= command->endy)
4809         {
4810                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4811                         MM_FREE(command->arrays);
4812                 DPSOFTRAST_UndoCommand(command->commandsize);
4813                 return;
4814         }
4815         command->clipped = dpsoftrast.drawclipped;
4816         command->refcount = dpsoftrast.numthreads;
4817
4818         if (dpsoftrast.usethreads)
4819         {
4820                 int i;
4821                 DPSOFTRAST_Draw_SyncCommands();
4822                 for (i = 0; i < dpsoftrast.numthreads; i++)
4823                 {
4824                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4825                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4826                                 Thread_CondSignal(thread->drawcond);
4827                 }
4828         }
4829         else
4830         {
4831                 DPSOFTRAST_Draw_FlushThreads();
4832         }
4833 }
4834  
/*
 * Execute queued commands for one thread, starting at thread->commandoffset
 * and stopping when the offset reaches endoffset.
 *
 * Commands are read from dpsoftrast.commandpool.commands and dispatched on
 * their opcode to the matching DPSOFTRAST_Interpret_* function.  The offset
 * wraps to 0 once it reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.  On return
 * thread->commandoffset holds the final offset.
 *
 * NOTE(review): the switch has no default case, so an opcode not listed here
 * would leave commandoffset unchanged and the loop would not advance.
 */
static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
{
        int commandoffset = thread->commandoffset;
        while (commandoffset != endoffset)
        {
                DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
                switch (command->opcode)
                {
// fixed-size commands: interpret, then advance by the aligned size of the
// command struct, wrapping at the end of the pool
#define INTERPCOMMAND(name) \
                case DPSOFTRAST_OPCODE_##name : \
                        DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
                        commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
                        if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
                                commandoffset = 0; \
                        break;
                INTERPCOMMAND(Viewport)
                INTERPCOMMAND(ClearColor)
                INTERPCOMMAND(ClearDepth)
                INTERPCOMMAND(ColorMask)
                INTERPCOMMAND(DepthTest)
                INTERPCOMMAND(ScissorTest)
                INTERPCOMMAND(Scissor)
                INTERPCOMMAND(BlendFunc)
                INTERPCOMMAND(BlendSubtract)
                INTERPCOMMAND(DepthMask)
                INTERPCOMMAND(DepthFunc)
                INTERPCOMMAND(DepthRange)
                INTERPCOMMAND(PolygonOffset)
                INTERPCOMMAND(CullFace)
                INTERPCOMMAND(AlphaTest)
                INTERPCOMMAND(AlphaFunc)
                INTERPCOMMAND(SetTexture)
                INTERPCOMMAND(SetShader)
                INTERPCOMMAND(Uniform4f)
                INTERPCOMMAND(UniformMatrix4f)
                INTERPCOMMAND(Uniform1i)

                // draw commands are variable-sized, so advance by the size
                // recorded in the command itself
                case DPSOFTRAST_OPCODE_Draw:
                        DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
                        commandoffset += command->commandsize;
                        if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
                                commandoffset = 0;
                        // store progress after every draw; other code compares
                        // thread->commandoffset against dpsoftrast.drawcommand
                        thread->commandoffset = commandoffset;
                        break;

                // Reset restarts reading at the beginning of the pool
                case DPSOFTRAST_OPCODE_Reset:
                        commandoffset = 0;
                        break;
                }
        }
        thread->commandoffset = commandoffset;
}
4887
/*
 * Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
 *
 * Runs until thread->index becomes negative (DPSOFTRAST_Shutdown sets it to
 * -1).  While commands are pending it interprets them up to the current
 * dpsoftrast.drawcommand; once caught up it sleeps on drawcond.
 *
 * Always returns 0.
 */
static int DPSOFTRAST_Draw_Thread(void *data)
{
        DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
        while(thread->index >= 0)
        {
                if (thread->commandoffset != dpsoftrast.drawcommand)
                {
                        DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
                }
                else 
                {
                        Thread_LockMutex(thread->drawmutex);
                        // re-check under the mutex before going to sleep
                        if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
                        {
                                // a flusher blocked on waitcond is released now that this thread is caught up
                                if (thread->waiting) Thread_CondSignal(thread->waitcond);
                                // starving tells producers that this thread needs a drawcond signal
                                thread->starving = true;
                                Thread_CondWait(thread->drawcond, thread->drawmutex);
                                thread->starving = false;
                        }
                        Thread_UnlockMutex(thread->drawmutex);
                }
        }   
        return 0;
}
4912
4913 static void DPSOFTRAST_Draw_FlushThreads(void)
4914 {
4915         DPSOFTRAST_State_Thread *thread;
4916         int i;
4917         DPSOFTRAST_Draw_SyncCommands();
4918         if (dpsoftrast.usethreads) 
4919         {
4920                 for (i = 0; i < dpsoftrast.numthreads; i++)
4921                 {
4922                         thread = &dpsoftrast.threads[i];
4923                         if (thread->commandoffset != dpsoftrast.drawcommand)
4924                         {
4925                                 Thread_LockMutex(thread->drawmutex);
4926                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4927                                         Thread_CondSignal(thread->drawcond);
4928                                 Thread_UnlockMutex(thread->drawmutex);
4929                         }
4930                 }
4931                 for (i = 0; i < dpsoftrast.numthreads; i++)
4932                 {
4933                         thread = &dpsoftrast.threads[i];
4934                         if (thread->commandoffset != dpsoftrast.drawcommand)
4935                         {
4936                                 Thread_LockMutex(thread->drawmutex);
4937                                 if (thread->commandoffset != dpsoftrast.drawcommand)
4938                                 {
4939                                         thread->waiting = true;
4940                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
4941                                         thread->waiting = false;
4942                                 }
4943                                 Thread_UnlockMutex(thread->drawmutex);
4944                         }
4945                 }
4946         }
4947         else
4948         {
4949                 for (i = 0; i < dpsoftrast.numthreads; i++)
4950                 {
4951                         thread = &dpsoftrast.threads[i];
4952                         if (thread->commandoffset != dpsoftrast.drawcommand)
4953                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4954                 }
4955         }
4956         dpsoftrast.commandpool.usedcommands = 0;
4957 }
4958
/*
 * Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
 */
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
4963
/*
 * Public finish entry point: currently identical to DPSOFTRAST_Flush.
 */
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Flush();
}
4968
4969 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4970 {
4971         int i;
4972         union
4973         {
4974                 int i;
4975                 unsigned char b[4];
4976         }
4977         u;
4978         u.i = 1;
4979         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4980         dpsoftrast.bigendian = u.b[3];
4981         dpsoftrast.fb_width = width;
4982         dpsoftrast.fb_height = height;
4983         dpsoftrast.fb_depthpixels = depthpixels;
4984         dpsoftrast.fb_colorpixels[0] = colorpixels;
4985         dpsoftrast.fb_colorpixels[1] = NULL;
4986         dpsoftrast.fb_colorpixels[1] = NULL;
4987         dpsoftrast.fb_colorpixels[1] = NULL;
4988         dpsoftrast.viewport[0] = 0;
4989         dpsoftrast.viewport[1] = 0;
4990         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4991         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4992         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4993         dpsoftrast.texture_firstfree = 1;
4994         dpsoftrast.texture_end = 1;
4995         dpsoftrast.texture_max = 0;
4996         dpsoftrast.color[0] = 1;
4997         dpsoftrast.color[1] = 1;
4998         dpsoftrast.color[2] = 1;
4999         dpsoftrast.color[3] = 1;
5000         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5001         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5002         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5003         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5004         for (i = 0; i < dpsoftrast.numthreads; i++)
5005         {
5006                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5007                 thread->index = i;
5008                 thread->cullface = GL_BACK;
5009                 thread->colormask[1] = 1;
5010                 thread->colormask[2] = 1;
5011                 thread->colormask[3] = 1;
5012                 thread->blendfunc[0] = GL_ONE;
5013                 thread->blendfunc[1] = GL_ZERO;
5014                 thread->depthmask = true;
5015                 thread->depthtest = true;
5016                 thread->depthfunc = GL_LEQUAL;
5017                 thread->scissortest = false;
5018                 thread->alphatest = false;
5019                 thread->alphafunc = GL_GREATER;
5020                 thread->alphavalue = 0.5f;
5021                 thread->viewport[0] = 0;
5022                 thread->viewport[1] = 0;
5023                 thread->viewport[2] = dpsoftrast.fb_width;
5024                 thread->viewport[3] = dpsoftrast.fb_height;
5025                 thread->scissor[0] = 0;
5026                 thread->scissor[1] = 0;
5027                 thread->scissor[2] = dpsoftrast.fb_width;
5028                 thread->scissor[3] = dpsoftrast.fb_height;
5029                 thread->depthrange[0] = 0;
5030                 thread->depthrange[1] = 1;
5031                 thread->polygonoffset[0] = 0;
5032                 thread->polygonoffset[1] = 0;
5033         
5034                 if (dpsoftrast.interlace)
5035                 {
5036                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5037                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5038                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5039                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5040                 }
5041                 else
5042                 {
5043                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5044                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5045                 }
5046
5047                 thread->numspans = 0;
5048                 thread->numtriangles = 0;
5049                 thread->commandoffset = 0;
5050                 thread->waiting = false;
5051                 thread->starving = false;
5052            
5053                 thread->validate = -1;
5054                 DPSOFTRAST_Validate(thread, -1);
5055  
5056                 if (dpsoftrast.usethreads)
5057                 {
5058                         thread->waitcond = Thread_CreateCond();
5059                         thread->drawcond = Thread_CreateCond();
5060                         thread->drawmutex = Thread_CreateMutex();
5061                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5062                 }
5063         }
5064         return 0;
5065 }
5066
5067 void DPSOFTRAST_Shutdown(void)
5068 {
5069         int i;
5070         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5071         {
5072                 DPSOFTRAST_State_Thread *thread;
5073                 for (i = 0; i < dpsoftrast.numthreads; i++)
5074                 {
5075                         thread = &dpsoftrast.threads[i];
5076                         Thread_LockMutex(thread->drawmutex);
5077                         thread->index = -1;
5078                         Thread_CondSignal(thread->drawcond);
5079                         Thread_UnlockMutex(thread->drawmutex);
5080                         Thread_WaitThread(thread->thread, 0);
5081                         Thread_DestroyCond(thread->waitcond);
5082                         Thread_DestroyCond(thread->drawcond);
5083                         Thread_DestroyMutex(thread->drawmutex);
5084                 }
5085         }
5086         for (i = 0;i < dpsoftrast.texture_end;i++)
5087                 if (dpsoftrast.texture[i].bytes)
5088                         MM_FREE(dpsoftrast.texture[i].bytes);
5089         if (dpsoftrast.texture)
5090                 free(dpsoftrast.texture);
5091         if (dpsoftrast.threads)
5092                 MM_FREE(dpsoftrast.threads);
5093         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5094 }
5095