de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
implemented r_shadow_particletrace cvar which enables an exceptionally
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// When compiled as C, alias bool to qboolean so the rest of this file can use bool in both C and C++ builds.
// qboolean is not declared in this file (presumably it comes from quakedef.h - confirm).
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// byte alignments applied by ALIGN() and ATOMIC() respectively
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// Compiler-specific alignment, memory barrier and atomic counter helpers.
// They are only provided when SSE2_PRESENT is defined and the compiler is
// GCC-compatible or MSVC; anything left undefined here falls back to the
// plain, non-atomic versions further down.
#ifdef SSE2_PRESENT
	#if defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(ALIGN_SIZE)))
		#define ATOMIC(var) var __attribute__((__aligned__(ATOMIC_SIZE)))
		#define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(ALIGN_SIZE)) var
		#define ATOMIC(var) __declspec(align(ATOMIC_SIZE)) var
		#define MEMORY_BARRIER (_mm_sfence())
		//(MemoryBarrier())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		// cast to void so ATOMIC_ADD has no usable value on any platform
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// fallbacks: no alignment, no barrier, ordinary (non-atomic) integer arithmetic
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
59
// Pixel/command memory allocation helpers.
// With SSE2 the memory is aligned to ATOMIC_SIZE via _mm_malloc and must be
// released with MM_FREE (_mm_free); otherwise the C allocator is used directly.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc-style allocation: nmemb*size zeroed bytes aligned to ATOMIC_SIZE.
// Returns NULL if the allocation fails or if nmemb*size does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject a wrapping product instead of silently allocating too little
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
78
// Vertex attribute array slots. The values are consecutive starting at zero,
// so DPSOFTRAST_ARRAY_TOTAL is the number of slots.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
94
// One slot of the dpsoftrast.texture array. A slot whose bytes pointer is NULL is treated as unused
// by DPSOFTRAST_Texture_GetByIndex and by the free-slot search in DPSOFTRAST_Texture_New.
95 typedef struct DPSOFTRAST_Texture_s
96 {
97         int flags; // creation flags as passed to DPSOFTRAST_Texture_New
98         int width; // level 0 width
99         int height; // level 0 height
100         int depth; // level 0 depth
101         int sides; // 6 when DPSOFTRAST_TEXTURE_FLAG_CUBEMAP is set, otherwise 1
102         DPSOFTRAST_TEXTURE_FILTER filter; // changed through DPSOFTRAST_Texture_Filter
103         int mipmaps; // number of entries of mipmap[] that are filled in
104         int size; // total byte size of all mipmap levels (the size of the bytes allocation)
105         ATOMIC_COUNTER binds; // when nonzero the texture functions call DPSOFTRAST_Flush before touching this texture
106         unsigned char *bytes; // pixel storage for all levels, 4 bytes per texel, allocated with MM_CALLOC
107         int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: {byte offset into bytes, byte size, width, height, depth}
108 }
109 DPSOFTRAST_Texture;
110
// COMMAND_ALIGN applies the ALIGN() attribute to each command struct; COMMAND_SIZE mirrors ALIGN_SIZE.
111 #define COMMAND_SIZE ALIGN_SIZE
112 #define COMMAND_ALIGN(var) ALIGN(var)
113
// Common header: every struct generated by DEFCOMMAND starts with these same two fields.
114 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
115 {
116         unsigned char opcode; // one of the DPSOFTRAST_OPCODE_ constants
117         unsigned short commandsize; // NOTE(review): presumably the size of the whole command in bytes - confirm where commands are allocated
118 }
119 DPSOFTRAST_Command);
120
// opcode 0 is declared by hand; all other opcodes are declared through DEFCOMMAND
121 enum { DPSOFTRAST_OPCODE_Reset = 0 };
122
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> = opcodeval
// and the aligned struct DPSOFTRAST_Command_<name> made of the common header followed by fields.
123 #define DEFCOMMAND(opcodeval, name, fields) \
124         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
125         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
126         { \
127                 unsigned char opcode; \
128                 unsigned short commandsize; \
129                 fields \
130         } DPSOFTRAST_Command_##name );
131
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL is the byte size of the commands[] buffer in the pool below.
132 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
133 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384 // NOTE(review): presumably the upper bound for a single command in bytes - confirm at the allocation site
134
// Shared buffer that queued commands are stored in.
135 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
136 {
137         int freecommand; // published to dpsoftrast.drawcommand by DPSOFTRAST_Draw_SyncCommands
138         int usedcommands; // checked against DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space by DPSOFTRAST_Draw_FreeCommandPool
139         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
140 }
141 DPSOFTRAST_State_Command_Pool);
142
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros.
143 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
144 {
145         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
146         float w[3];
147         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]); // per array: [0] = change per x, [1] = change per y, [2] = base value (4 floats each)
148 }
149 DPSOFTRAST_State_Triangle);
150
// Evaluate attribute array arrayindex of triangle at (span->x, span->y), SSE version:
//   slope = attribs[arrayindex][0]                       (change per x)
//   data  = attribs[arrayindex][2] + x*slope + y*attribs[arrayindex][1]
// data and slope must be __m128 lvalues; triangle and span are pointers.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4] arrays.
// Every macro argument is parenthesized so pointer expressions such as &span work.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
167                                         
168 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
169
// One horizontal run of pixels belonging to a triangle; stored in the per-thread spans[] array.
170 typedef ALIGN(struct DPSOFTRAST_State_Span_s
171 {
172         int triangle; // triangle this span was generated by
173         int x; // framebuffer x coord
174         int y; // framebuffer y coord
175         int startx; // usable range (according to pixelmask)
176         int endx; // usable range (according to pixelmask)
177         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
178 }
179 DPSOFTRAST_State_Span);
180
// capacities of the per-thread spans[] and triangles[] arrays
181 #define DPSOFTRAST_DRAW_MAXSPANS 1024
182 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
183
// Bit flags stored in the per-thread validate field; each one names a group of derived values
// that DPSOFTRAST_Validate recomputes (and clears the flag for) on request.
184 #define DPSOFTRAST_VALIDATE_FB 1
185 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
186 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
187 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
188
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc picks one
// from the current blend factors. Values are consecutive starting at zero.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
204
// State of one rasterizer thread (element of dpsoftrast.threads): a copy of the render state,
// values derived from it, and the thread's span/triangle work buffers.
205 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
206 {
207         void *thread;
208         int index;
209         
210         int cullface;
211         int colormask[4];
212         int blendfunc[2]; // [0] = source factor, [1] = destination factor (see DPSOFTRAST_RecalcBlendFunc)
213         int blendsubtract;
214         int depthmask;
215         int depthtest;
216         int depthfunc;
217         int scissortest;
218         int alphatest;
219         int alphafunc;
220         float alphavalue;
221         int viewport[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcViewport
222         int scissor[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcFB
223         float depthrange[2];
224         float polygonoffset[2];
225
226         int shader_mode;
227         int shader_permutation;
228
229         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow
230         
231         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
232         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
233
234         // DPSOFTRAST_VALIDATE_ flags
235         int validate; // a set flag means the matching derived values below are stale
236
237         // derived values (DPSOFTRAST_VALIDATE_FB)
238         int fb_colormask;
239         int fb_scissor[4]; // x, y, width, height after flipping y and clamping to the framebuffer
240         ALIGN(float fb_viewportcenter[4]);
241         ALIGN(float fb_viewportscale[4]);
242
243         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
244         int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off
245
246         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
247         int fb_blendmode; // a DPSOFTRAST_BLENDMODE value
248
249         // band boundaries
250         int miny1;
251         int maxy1;
252         int miny2;
253         int maxy2;
254
255         ATOMIC(volatile int commandoffset);
256
257         volatile bool waiting;
258         volatile bool starving;
259         void *waitcond;
260         void *drawcond;
261         void *drawmutex;
262
263         int numspans;
264         int numtriangles;
265         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
266         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
267 }
268 DPSOFTRAST_State_Thread);
269
// Global rasterizer state: render targets, vertex array pointers, the texture table,
// the worker threads and the shared command pool. The single instance is dpsoftrast.
270 typedef ATOMIC(struct DPSOFTRAST_State_s
271 {
272         int fb_width; // render target size and pixel pointers, set by DPSOFTRAST_SetRenderTargets
273         int fb_height;
274         unsigned int *fb_depthpixels;
275         unsigned int *fb_colorpixels[4];
276
277         int viewport[4];
278         ALIGN(float fb_viewportcenter[4]);
279         ALIGN(float fb_viewportscale[4]);
280
281         float color[4];
282         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
283         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
284
285         const float *pointer_vertex3f;
286         const float *pointer_color4f;
287         const unsigned char *pointer_color4ub;
288         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
289         int stride_vertex;
290         int stride_color;
291         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
292         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
293         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into texture[]; rebased by DPSOFTRAST_Texture_Grow
294
295         int firstvertex;
296         int numvertices;
297         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
298         float *screencoord4f;
299         int drawstarty;
300         int drawendy;
301         int drawclipped;
302         
303         int shader_mode;
304         int shader_permutation;
305
306         int texture_max; // allocated number of slots in texture[]
307         int texture_end; // DPSOFTRAST_Texture_GetByIndex only accepts indices below this
308         int texture_firstfree; // slot index where DPSOFTRAST_Texture_New starts searching for a free slot
309         DPSOFTRAST_Texture *texture; // texture table, grown by DPSOFTRAST_Texture_Grow
310
311         int bigendian;
312
313         // error reporting
314         const char *errorstring; // set to a static message by functions that reject their arguments
315
316         bool usethreads;
317         int interlace;
318         int numthreads;
319         DPSOFTRAST_State_Thread *threads;
320
321         ATOMIC(volatile int drawcommand); // set from commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
322
323         DPSOFTRAST_State_Command_Pool commandpool;
324 }
325 DPSOFTRAST_State);
326
327 DPSOFTRAST_State dpsoftrast;
328
// depth values are scaled by 2^30 when converted from float to integer
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels (0..1) into one BGRA8 value: blue in bits 0-7, green 8-15,
// red 16-23, alpha 24-31. Each channel is rounded to nearest. Arguments are
// parenthesized so expressions can be passed, and the shifts are done on unsigned
// values so that alpha 255 << 24 does not overflow a signed int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) <<  8) | \
	 (unsigned int)(int)((b) * 255.0f + 0.5f)        | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Convert a float depth d to the integer depth representation: DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// NOTE(review): presumably the longest span processed in one piece - confirm at the use sites
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
334
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
336 {
337         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339         fb_viewportcenter[3] = 0.5f;
340         fb_viewportcenter[0] = 0.0f;
341         fb_viewportscale[1] = 0.5f * viewport[2];
342         fb_viewportscale[2] = -0.5f * viewport[3];
343         fb_viewportscale[3] = 0.5f;
344         fb_viewportscale[0] = 1.0f;
345 }
346
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
348 {
349         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350         // and viewport projection values
351         int x1, x2;
352         int y1, y2;
353         x1 = thread->scissor[0];
354         x2 = thread->scissor[0] + thread->scissor[2];
355         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356         y2 = dpsoftrast.fb_height - thread->scissor[1];
357         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
358         if (x1 < 0) x1 = 0;
359         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
360         if (y1 < 0) y1 = 0;
361         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
362         thread->fb_scissor[0] = x1;
363         thread->fb_scissor[1] = y1;
364         thread->fb_scissor[2] = x2 - x1;
365         thread->fb_scissor[3] = y2 - y1;
366
367         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
368 }
369
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
371 {
372         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
373 }
374
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
376 {
377         if (thread->blendsubtract)
378         {
379                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
380                 {
381                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
383                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
385                 }
386         }
387         else
388         {       
389                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390                 {
391                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
396                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
402                 }
403         }
404 }
405
// Inline gate in front of DPSOFTRAST_Validate: the call is only made when at least one of
// the flags in f is still set in thread->validate. The expression always evaluates to 0.
// thread is parenthesized so any pointer expression can be passed; it is evaluated twice
// when validation is needed.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
407
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
409 {
410         mask &= thread->validate;
411         if (!mask)
412                 return;
413         if (mask & DPSOFTRAST_VALIDATE_FB)
414         {
415                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416                 DPSOFTRAST_RecalcFB(thread);
417         }
418         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
419         {
420                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421                 DPSOFTRAST_RecalcDepthFunc(thread);
422         }
423         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
424         {
425                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426                 DPSOFTRAST_RecalcBlendFunc(thread);
427         }
428 }
429
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
431 {
432         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433                 return &dpsoftrast.texture[index];
434         return NULL;
435 }
436
437 static void DPSOFTRAST_Texture_Grow(void)
438 {
439         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440         DPSOFTRAST_State_Thread *thread;
441         int i;
442         int j;
443         DPSOFTRAST_Flush();
444         // expand texture array as needed
445         if (dpsoftrast.texture_max < 1024)
446                 dpsoftrast.texture_max = 1024;
447         else
448                 dpsoftrast.texture_max *= 2;
449         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
450         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451                 if (dpsoftrast.texbound[i])
452                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
453         for (j = 0; j < dpsoftrast.numthreads; j++)
454         {
455                 thread = &dpsoftrast.threads[j];
456                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457                         if (thread->texbound[i])
458                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
459         }
460 }
461
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
463 {
464         int w;
465         int h;
466         int d;
467         int size;
468         int s;
469         int texnum;
470         int mipmaps;
471         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473         DPSOFTRAST_Texture *texture;
474         if (width*height*depth < 1)
475         {
476                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
477                 return 0;
478         }
479         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
480         {
481                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
482                 return 0;
483         }
484         switch(texformat)
485         {
486         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
489                 break;
490         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
492                 {
493                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
494                         return 0;
495                 }
496                 if (depth != 1)
497                 {
498                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
499                         return 0;
500                 }
501                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
502                 {
503                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
504                         return 0;
505                 }
506                 break;
507         }
508         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
509         {
510                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
511                 return 0;
512         }
513         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
514         {
515                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
516                 return 0;
517         }
518         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
519         {
520                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
521                 return 0;
522         }
523         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524         {
525                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
526                 return 0;
527         }
528         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
529         {
530                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
531                 return 0;
532         }
533         // find first empty slot in texture array
534         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535                 if (!dpsoftrast.texture[texnum].bytes)
536                         break;
537         dpsoftrast.texture_firstfree = texnum + 1;
538         if (dpsoftrast.texture_max <= texnum)
539                 DPSOFTRAST_Texture_Grow();
540         if (dpsoftrast.texture_end <= texnum)
541                 dpsoftrast.texture_end = texnum + 1;
542         texture = &dpsoftrast.texture[texnum];
543         memset(texture, 0, sizeof(*texture));
544         texture->flags = flags;
545         texture->width = width;
546         texture->height = height;
547         texture->depth = depth;
548         texture->sides = sides;
549         texture->binds = 0;
550         w = width;
551         h = height;
552         d = depth;
553         size = 0;
554         mipmaps = 0;
555         w = width;
556         h = height;
557         d = depth;
558         for (;;)
559         {
560                 s = w * h * d * sides * 4;
561                 texture->mipmap[mipmaps][0] = size;
562                 texture->mipmap[mipmaps][1] = s;
563                 texture->mipmap[mipmaps][2] = w;
564                 texture->mipmap[mipmaps][3] = h;
565                 texture->mipmap[mipmaps][4] = d;
566                 size += s;
567                 mipmaps++;
568                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569                         break;
570                 if (w > 1) w >>= 1;
571                 if (h > 1) h >>= 1;
572                 if (d > 1) d >>= 1;
573         }
574         texture->mipmaps = mipmaps;
575         texture->size = size;
576
577         // allocate the pixels now
578         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
579
580         return texnum;
581 }
582 void DPSOFTRAST_Texture_Free(int index)
583 {
584         DPSOFTRAST_Texture *texture;
585         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
586         if (texture->binds)
587                 DPSOFTRAST_Flush();
588         if (texture->bytes)
589                 MM_FREE(texture->bytes);
590         texture->bytes = NULL;
591         memset(texture, 0, sizeof(*texture));
592         // adjust the free range and used range
593         if (dpsoftrast.texture_firstfree > index)
594                 dpsoftrast.texture_firstfree = index;
595         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596                 dpsoftrast.texture_end--;
597 }
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
599 {
600         int i, x, y, z, w, layer0, layer1, row0, row1;
601         unsigned char *o, *i0, *i1, *i2, *i3;
602         DPSOFTRAST_Texture *texture;
603         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604         if (texture->mipmaps <= 1)
605                 return;
606         for (i = 1;i < texture->mipmaps;i++)
607         {
608                 for (z = 0;z < texture->mipmap[i][4];z++)
609                 {
610                         layer0 = z*2;
611                         layer1 = z*2+1;
612                         if (layer1 >= texture->mipmap[i-1][4])
613                                 layer1 = texture->mipmap[i-1][4]-1;
614                         for (y = 0;y < texture->mipmap[i][3];y++)
615                         {
616                                 row0 = y*2;
617                                 row1 = y*2+1;
618                                 if (row1 >= texture->mipmap[i-1][3])
619                                         row1 = texture->mipmap[i-1][3]-1;
620                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
621                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625                                 w = texture->mipmap[i][2];
626                                 if (layer1 > layer0)
627                                 {
628                                         if (texture->mipmap[i-1][2] > 1)
629                                         {
630                                                 // average 3D texture
631                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
632                                                 {
633                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
637                                                 }
638                                         }
639                                         else
640                                         {
641                                                 // average 3D mipmap with parent width == 1
642                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
643                                                 {
644                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
648                                                 }
649                                         }
650                                 }
651                                 else
652                                 {
653                                         if (texture->mipmap[i-1][2] > 1)
654                                         {
655                                                 // average 2D texture (common case)
656                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
657                                                 {
658                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
662                                                 }
663                                         }
664                                         else
665                                         {
666                                                 // 2D texture with parent width == 1
667                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
668                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
669                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
670                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
671                                         }
672                                 }
673                         }
674                 }
675         }
676 }
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
678 {
679         DPSOFTRAST_Texture *texture;
680         unsigned char *dst;
681         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
682         if (texture->binds)
683                 DPSOFTRAST_Flush();
684         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685         while (blockheight > 0)
686         {
687                 memcpy(dst, pixels, blockwidth * 4);
688                 pixels += blockwidth * 4;
689                 dst += texture->mipmap[0][2] * 4;
690                 blockheight--;
691         }
692         DPSOFTRAST_Texture_CalculateMipmaps(index);
693 }
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
695 {
696         DPSOFTRAST_Texture *texture;
697         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
698         if (texture->binds)
699                 DPSOFTRAST_Flush();
700         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707         return texture->mipmap[mip][2];
708 }
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
710 {
711         DPSOFTRAST_Texture *texture;
712         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713         return texture->mipmap[mip][3];
714 }
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
716 {
717         DPSOFTRAST_Texture *texture;
718         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719         return texture->mipmap[mip][4];
720 }
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
722 {
723         DPSOFTRAST_Texture *texture;
724         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725         if (texture->binds)
726                 DPSOFTRAST_Flush();
727         return texture->bytes + texture->mipmap[mip][0];
728 }
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
730 {
731         DPSOFTRAST_Texture *texture;
732         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
734         {
735                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
736                 return;
737         }
738         if (texture->binds)
739                 DPSOFTRAST_Flush();
740         texture->filter = filter;
741 }
742
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
744 {
745         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
748                 DPSOFTRAST_Flush();
749         dpsoftrast.fb_width = width;
750         dpsoftrast.fb_height = height;
751         dpsoftrast.fb_depthpixels = depthpixels;
752         dpsoftrast.fb_colorpixels[0] = colorpixels0;
753         dpsoftrast.fb_colorpixels[1] = colorpixels1;
754         dpsoftrast.fb_colorpixels[2] = colorpixels2;
755         dpsoftrast.fb_colorpixels[3] = colorpixels3;
756 }
757
758 static void DPSOFTRAST_Draw_FlushThreads(void);
759
760 static void DPSOFTRAST_Draw_SyncCommands(void)
761 {
762         if(dpsoftrast.usethreads) MEMORY_BARRIER;
763         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
764 }
765
766 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
767 {
768         DPSOFTRAST_State_Thread *thread;
769         int i;
770         int freecommand = dpsoftrast.commandpool.freecommand;
771         int usedcommands = dpsoftrast.commandpool.usedcommands;
772         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
773                 return;
774         DPSOFTRAST_Draw_SyncCommands();
775         for(;;)
776         {
777                 int waitindex = -1;
778                 int commandoffset;
779                 usedcommands = 0;
780                 for (i = 0; i < dpsoftrast.numthreads; i++)
781                 {
782                         thread = &dpsoftrast.threads[i]; 
783                         commandoffset = freecommand - thread->commandoffset;
784                         if (commandoffset < 0)
785                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
786                         if (commandoffset > usedcommands)
787                         {
788                                 waitindex = i;
789                                 usedcommands = commandoffset;
790                         }
791                 }
792                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
793                         break;
794                 thread = &dpsoftrast.threads[waitindex];
795                 Thread_LockMutex(thread->drawmutex);
796                 if (thread->commandoffset != dpsoftrast.drawcommand)
797                 {
798                         thread->waiting = true;
799                         if (thread->starving) Thread_CondSignal(thread->drawcond);
800                         Thread_CondWait(thread->waitcond, thread->drawmutex);
801                         thread->waiting = false;
802                 }
803                 Thread_UnlockMutex(thread->drawmutex);
804         }
805         dpsoftrast.commandpool.usedcommands = usedcommands;
806 }
807
// DPSOFTRAST_ALIGNCOMMAND(size): pads `size` with the remainder needed to
// reach the next multiple of COMMAND_SIZE.  The masking with COMMAND_SIZE-1
// only works when COMMAND_SIZE is a power of two.
// Note that `size` is evaluated twice.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	( \
		(size) + \
		((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)) \
	)
// DPSOFTRAST_ALLOCATECOMMAND(name): allocates a command of type
// DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name> through
// DPSOFTRAST_AllocateCommand, using the padded struct size, and casts the
// result to the command type.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name , \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
812
813 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
814 {
815         DPSOFTRAST_Command *command;
816         int freecommand = dpsoftrast.commandpool.freecommand;
817         int usedcommands = dpsoftrast.commandpool.usedcommands;
818         int extra = sizeof(DPSOFTRAST_Command);
819         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
820                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
821         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
822         {
823                 if (dpsoftrast.usethreads)
824                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
825                 else
826                         DPSOFTRAST_Draw_FlushThreads();
827                 freecommand = dpsoftrast.commandpool.freecommand;
828                 usedcommands = dpsoftrast.commandpool.usedcommands;
829         }
830         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831         {
832                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833                 command->opcode = DPSOFTRAST_OPCODE_Reset;
834                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
835                 freecommand = 0;
836         }
837         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838         command->opcode = opcode;
839         command->commandsize = size;
840         freecommand += size;
841         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
842                 freecommand = 0;
843         dpsoftrast.commandpool.freecommand = freecommand;
844         dpsoftrast.commandpool.usedcommands = usedcommands + size;
845         return command;
846 }
847
848 static void DPSOFTRAST_UndoCommand(int size)
849 {
850         int freecommand = dpsoftrast.commandpool.freecommand;
851         int usedcommands = dpsoftrast.commandpool.usedcommands;
852         freecommand -= size;
853         if (freecommand < 0)
854                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855         usedcommands -= size;
856         dpsoftrast.commandpool.freecommand = freecommand;
857         dpsoftrast.commandpool.usedcommands = usedcommands;
858 }
859                 
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
862 {
863         thread->viewport[0] = command->x;
864         thread->viewport[1] = command->y;
865         thread->viewport[2] = command->width;
866         thread->viewport[3] = command->height;
867         thread->validate |= DPSOFTRAST_VALIDATE_FB;
868 }
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
870 {
871         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
872         command->x = x;
873         command->y = y;
874         command->width = width;
875         command->height = height;
876
877         dpsoftrast.viewport[0] = x;
878         dpsoftrast.viewport[1] = y;
879         dpsoftrast.viewport[2] = width;
880         dpsoftrast.viewport[3] = height;
881         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
882 }
883
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
886 {
887         int i, x1, y1, x2, y2, w, h, x, y;
888         int miny1 = thread->miny1;
889         int maxy1 = thread->maxy1;
890         int miny2 = thread->miny2;
891         int maxy2 = thread->maxy2;
892         int bandy;
893         unsigned int *p;
894         unsigned int c;
895         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896         x1 = thread->fb_scissor[0];
897         y1 = thread->fb_scissor[1];
898         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
900         if (y1 < miny1) y1 = miny1;
901         if (y2 > maxy2) y2 = maxy2;
902         w = x2 - x1;
903         h = y2 - y1;
904         if (w < 1 || h < 1)
905                 return;
906         // FIXME: honor fb_colormask?
907         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
908         for (i = 0;i < 4;i++)
909         {
910                 if (!dpsoftrast.fb_colorpixels[i])
911                         continue;
912                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
913                 for (;y < bandy;y++)
914                 {
915                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916                         for (x = x1;x < x2;x++)
917                                 p[x] = c;
918                 }
919         }
920 }
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
922 {
923         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
924         command->r = r;
925         command->g = g;
926         command->b = b;
927         command->a = a;
928 }
929
930 DEFCOMMAND(3, ClearDepth, float depth;)
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
932 {
933         int x1, y1, x2, y2, w, h, x, y;
934         int miny1 = thread->miny1;
935         int maxy1 = thread->maxy1;
936         int miny2 = thread->miny2;
937         int maxy2 = thread->maxy2;
938         int bandy;
939         unsigned int *p;
940         unsigned int c;
941         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942         x1 = thread->fb_scissor[0];
943         y1 = thread->fb_scissor[1];
944         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
946         if (y1 < miny1) y1 = miny1;
947         if (y2 > maxy2) y2 = maxy2;
948         w = x2 - x1;
949         h = y2 - y1;
950         if (w < 1 || h < 1)
951                 return;
952         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
953         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
954         for (;y < bandy;y++)
955         {
956                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957                 for (x = x1;x < x2;x++)
958                         p[x] = c;
959         }
960 }
961 void DPSOFTRAST_ClearDepth(float d)
962 {
963         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
964         command->depth = d;
965 }
966
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
969 {
970         thread->colormask[0] = command->r != 0;
971         thread->colormask[1] = command->g != 0;
972         thread->colormask[2] = command->b != 0;
973         thread->colormask[3] = command->a != 0;
974         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
975 }
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
977 {
978         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
985 DEFCOMMAND(5, DepthTest, int enable;)
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
987 {
988         thread->depthtest = command->enable;
989         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
990 }
991 void DPSOFTRAST_DepthTest(int enable)
992 {
993         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994         command->enable = enable;
995 }
996
997 DEFCOMMAND(6, ScissorTest, int enable;)
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
999 {
1000         thread->scissortest = command->enable;
1001         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1002 }
1003 void DPSOFTRAST_ScissorTest(int enable)
1004 {
1005         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006         command->enable = enable;
1007 }
1008
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1011 {
1012         thread->scissor[0] = command->x;
1013         thread->scissor[1] = command->y;
1014         thread->scissor[2] = command->width;
1015         thread->scissor[3] = command->height;
1016         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1017 }
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1019 {
1020         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1021         command->x = x;
1022         command->y = y;
1023         command->width = width;
1024         command->height = height;
1025 }
1026
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1029 {
1030         thread->blendfunc[0] = command->sfactor;
1031         thread->blendfunc[1] = command->dfactor;
1032         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1033 }
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1035 {
1036         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037         command->sfactor = sfactor;
1038         command->dfactor = dfactor;
1039 }
1040
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1043 {
1044         thread->blendsubtract = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1046 }
1047 void DPSOFTRAST_BlendSubtract(int enable)
1048 {
1049         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(10, DepthMask, int enable;)
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1055 {
1056         thread->depthmask = command->enable;
1057 }
1058 void DPSOFTRAST_DepthMask(int enable)
1059 {
1060         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061         command->enable = enable;
1062 }
1063
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1066 {
1067         thread->depthfunc = command->func;
1068 }
1069 void DPSOFTRAST_DepthFunc(int func)
1070 {
1071         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072         command->func = func;
1073 }
1074
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1077 {
1078         thread->depthrange[0] = command->nearval;
1079         thread->depthrange[1] = command->farval;
1080 }
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1082 {
1083         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084         command->nearval = nearval;
1085         command->farval = farval;
1086 }
1087
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1090 {
1091         thread->polygonoffset[0] = command->alongnormal;
1092         thread->polygonoffset[1] = command->intoview;
1093 }
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1095 {
1096         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097         command->alongnormal = alongnormal;
1098         command->intoview = intoview;
1099 }
1100
1101 DEFCOMMAND(14, CullFace, int mode;)
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1103 {
1104         thread->cullface = command->mode;
1105 }
1106 void DPSOFTRAST_CullFace(int mode)
1107 {
1108         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109         command->mode = mode;
1110 }
1111
1112 DEFCOMMAND(15, AlphaTest, int enable;)
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1114 {
1115         thread->alphatest = command->enable;
1116 }
1117 void DPSOFTRAST_AlphaTest(int enable)
1118 {
1119         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120         command->enable = enable;
1121 }
1122
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1125 {
1126         thread->alphafunc = command->func;
1127         thread->alphavalue = command->ref;
1128 }
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1130 {
1131         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132         command->func = func;
1133         command->ref = ref;
1134 }
1135
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1137 {
1138         dpsoftrast.color[0] = r;
1139         dpsoftrast.color[1] = g;
1140         dpsoftrast.color[2] = b;
1141         dpsoftrast.color[3] = a;
1142 }
1143
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1145 {
1146         int outstride = blockwidth * 4;
1147         int instride = dpsoftrast.fb_width * 4;
1148         int bx1 = blockx;
1149         int by1 = blocky;
1150         int bx2 = blockx + blockwidth;
1151         int by2 = blocky + blockheight;
1152         int bw;
1153         int x;
1154         int y;
1155         unsigned char *inpixels;
1156         unsigned char *b;
1157         unsigned char *o;
1158         DPSOFTRAST_Flush();
1159         if (bx1 < 0) bx1 = 0;
1160         if (by1 < 0) by1 = 0;
1161         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1162         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1163         bw = bx2 - bx1;
1164         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1165         if (dpsoftrast.bigendian)
1166         {
1167                 for (y = by1;y < by2;y++)
1168                 {
1169                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1170                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1171                         for (x = bx1;x < bx2;x++)
1172                         {
1173                                 o[0] = b[3];
1174                                 o[1] = b[2];
1175                                 o[2] = b[1];
1176                                 o[3] = b[0];
1177                                 o += 4;
1178                                 b += 4;
1179                         }
1180                 }
1181         }
1182         else
1183         {
1184                 for (y = by1;y < by2;y++)
1185                 {
1186                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1187                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1188                         memcpy(o, b, bw*4);
1189                 }
1190         }
1191
1192 }
1193 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1194 {
1195         int tx1 = tx;
1196         int ty1 = ty;
1197         int tx2 = tx + width;
1198         int ty2 = ty + height;
1199         int sx1 = sx;
1200         int sy1 = sy;
1201         int sx2 = sx + width;
1202         int sy2 = sy + height;
1203         int swidth;
1204         int sheight;
1205         int twidth;
1206         int theight;
1207         int sw;
1208         int sh;
1209         int tw;
1210         int th;
1211         int y;
1212         unsigned int *spixels;
1213         unsigned int *tpixels;
1214         DPSOFTRAST_Texture *texture;
1215         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1216         if (mip < 0 || mip >= texture->mipmaps) return;
1217         DPSOFTRAST_Flush();
1218         spixels = dpsoftrast.fb_colorpixels[0];
1219         swidth = dpsoftrast.fb_width;
1220         sheight = dpsoftrast.fb_height;
1221         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1222         twidth = texture->mipmap[mip][2];
1223         theight = texture->mipmap[mip][3];
1224         if (tx1 < 0) tx1 = 0;
1225         if (ty1 < 0) ty1 = 0;
1226         if (tx2 > twidth) tx2 = twidth;
1227         if (ty2 > theight) ty2 = theight;
1228         if (sx1 < 0) sx1 = 0;
1229         if (sy1 < 0) sy1 = 0;
1230         if (sx2 > swidth) sx2 = swidth;
1231         if (sy2 > sheight) sy2 = sheight;
1232         tw = tx2 - tx1;
1233         th = ty2 - ty1;
1234         sw = sx2 - sx1;
1235         sh = sy2 - sy1;
1236         if (tw > sw) tw = sw;
1237         if (th > sh) th = sh;
1238         if (tw < 1 || th < 1)
1239                 return;
1240         for (y = 0;y < th;y++)
1241                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1242         if (texture->mipmaps > 1)
1243                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1244 }
1245
1246 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1247 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1248 {
1249         if (thread->texbound[command->unitnum])
1250                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1251         thread->texbound[command->unitnum] = command->texture;
1252 }
1253 void DPSOFTRAST_SetTexture(int unitnum, int index)
1254 {
1255         DPSOFTRAST_Command_SetTexture *command;
1256         DPSOFTRAST_Texture *texture;
1257         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1258         {
1259                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1260                 return;
1261         }
1262         texture = DPSOFTRAST_Texture_GetByIndex(index);
1263         if (index && !texture)
1264         {
1265                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1266                 return;
1267         }
1268
1269         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1270         command->unitnum = unitnum;
1271         command->texture = texture;
1272
1273         dpsoftrast.texbound[unitnum] = texture;
1274         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1275 }
1276
1277 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1278 {
1279         dpsoftrast.pointer_vertex3f = vertex3f;
1280         dpsoftrast.stride_vertex = stride;
1281 }
1282 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1283 {
1284         dpsoftrast.pointer_color4f = color4f;
1285         dpsoftrast.pointer_color4ub = NULL;
1286         dpsoftrast.stride_color = stride;
1287 }
1288 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1289 {
1290         dpsoftrast.pointer_color4f = NULL;
1291         dpsoftrast.pointer_color4ub = color4ub;
1292         dpsoftrast.stride_color = stride;
1293 }
1294 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1295 {
1296         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1297         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1298         dpsoftrast.stride_texcoord[unitnum] = stride;
1299 }
1300
1301 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1302 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1303 {
1304         thread->shader_mode = command->mode;
1305         thread->shader_permutation = command->permutation;
1306 }
1307 void DPSOFTRAST_SetShader(int mode, int permutation)
1308 {
1309         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1310         command->mode = mode;
1311         command->permutation = permutation;
1312
1313         dpsoftrast.shader_mode = mode;
1314         dpsoftrast.shader_permutation = permutation;
1315 }
1316
1317 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1318 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1319 {
1320         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1321 }
1322 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1323 {
1324         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1325         command->index = index;
1326         command->val[0] = v0;
1327         command->val[1] = v1;
1328         command->val[2] = v2;
1329         command->val[3] = v3;
1330
1331         dpsoftrast.uniform4f[index*4+0] = v0;
1332         dpsoftrast.uniform4f[index*4+1] = v1;
1333         dpsoftrast.uniform4f[index*4+2] = v2;
1334         dpsoftrast.uniform4f[index*4+3] = v3;
1335 }
1336 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1337 {
1338         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1339         command->index = index;
1340         memcpy(command->val, v, sizeof(command->val));
1341
1342         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1343 }
1344
1345 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1346 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1347 {
1348         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1349 }
1350 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1351 {
1352 #ifdef SSE2_PRESENT
1353         int i, index;
1354         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1355         {
1356                 __m128 m0, m1, m2, m3;
1357                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1358                 command->index = (DPSOFTRAST_UNIFORM)index;
1359                 if (((size_t)v)&(ALIGN_SIZE-1))
1360                 {
1361                         m0 = _mm_loadu_ps(v);
1362                         m1 = _mm_loadu_ps(v+4);
1363                         m2 = _mm_loadu_ps(v+8);
1364                         m3 = _mm_loadu_ps(v+12);
1365                 }
1366                 else
1367                 {
1368                         m0 = _mm_load_ps(v);
1369                         m1 = _mm_load_ps(v+4);
1370                         m2 = _mm_load_ps(v+8);
1371                         m3 = _mm_load_ps(v+12);
1372                 }
1373                 if (transpose)
1374                 {
1375                         __m128 t0, t1, t2, t3;
1376                         t0 = _mm_unpacklo_ps(m0, m1);
1377                         t1 = _mm_unpacklo_ps(m2, m3);
1378                         t2 = _mm_unpackhi_ps(m0, m1);
1379                         t3 = _mm_unpackhi_ps(m2, m3);
1380                         m0 = _mm_movelh_ps(t0, t1);
1381                         m1 = _mm_movehl_ps(t1, t0);
1382                         m2 = _mm_movelh_ps(t2, t3);
1383                         m3 = _mm_movehl_ps(t3, t2);                     
1384                 }
1385                 _mm_store_ps(command->val, m0);
1386                 _mm_store_ps(command->val+4, m1);
1387                 _mm_store_ps(command->val+8, m2);
1388                 _mm_store_ps(command->val+12, m3);
1389                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1390                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1391                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1392                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1393         }
1394 #endif
1395 }
1396
1397 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1398 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1399 {
1400         thread->uniform1i[command->index] = command->val;
1401 }
1402 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1403 {
1404         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1405         command->index = index;
1406         command->val = i0;
1407
1408         dpsoftrast.uniform1i[command->index] = i0;
1409 }
1410
1411 #ifdef SSE2_PRESENT
1412 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1413 {
1414         float *end = dst + size*4;
1415         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1416         {
1417                 while (dst < end)
1418                 {
1419                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1420                         dst += 4;
1421                         src += stride;
1422                 }
1423         }
1424         else
1425         {
1426                 while (dst < end)
1427                 {
1428                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1429                         dst += 4;
1430                         src += stride;
1431                 }
1432         }
1433 }
1434
1435 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1436 {
1437         float *end = dst + size*4;
1438         if (stride == sizeof(float[3]))
1439         {
1440                 float *end4 = dst + (size&~3)*4;        
1441                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1442                 {
1443                         while (dst < end4)
1444                         {
1445                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1446                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1447                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1448                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1449                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1450                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1451                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1452                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1453                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1454                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1455                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1456                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1457                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1458                                 dst += 16;
1459                                 src += 4*sizeof(float[3]);
1460                         }
1461                 }
1462                 else
1463                 {
1464                         while (dst < end4)
1465                         {
1466                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1467                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1468                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1469                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1470                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1471                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1472                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1474                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1475                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1476                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1477                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dst += 16;
1480                                 src += 4*sizeof(float[3]);
1481                         }
1482                 }
1483         }
1484         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1485         {
1486                 while (dst < end)
1487                 {
1488                         __m128 v = _mm_loadu_ps((const float *)src);
1489                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1490                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1491                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1492                         _mm_store_ps(dst, v);
1493                         dst += 4;
1494                         src += stride;
1495                 }
1496         }
1497         else
1498         {
1499                 while (dst < end)
1500                 {
1501                         __m128 v = _mm_load_ps((const float *)src);
1502                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1503                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1504                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1505                         _mm_store_ps(dst, v);
1506                         dst += 4;
1507                         src += stride;
1508                 }
1509         }
1510 }
1511
1512 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1513 {
1514         float *end = dst + size*4;
1515         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1516         if (stride == sizeof(float[2]))
1517         {
1518                 float *end2 = dst + (size&~1)*4;
1519                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1520                 {
1521                         while (dst < end2)
1522                         {
1523                                 __m128 v = _mm_loadu_ps((const float *)src);
1524                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1525                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1526                                 dst += 8;
1527                                 src += 2*sizeof(float[2]);
1528                         }
1529                 }
1530                 else
1531                 {
1532                         while (dst < end2)
1533                         {
1534                                 __m128 v = _mm_load_ps((const float *)src);
1535                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1536                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1537                                 dst += 8;
1538                                 src += 2*sizeof(float[2]);
1539                         }
1540                 }
1541         }
1542         while (dst < end)
1543         {
1544                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1545                 dst += 4;
1546                 src += stride;
1547         }
1548 }
1549
1550 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1551 {
1552         float *end = dst + size*4;
1553         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1554         if (stride == sizeof(unsigned char[4]))
1555         {
1556                 float *end4 = dst + (size&~3)*4;
1557                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1558                 {
1559                         while (dst < end4)
1560                         {
1561                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1562                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1563                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1564                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1565                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1566                                 dst += 16;
1567                                 src += 4*sizeof(unsigned char[4]);
1568                         }
1569                 }
1570                 else
1571                 {
1572                         while (dst < end4)
1573                         {
1574                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1575                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1576                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1577                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1578                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1579                                 dst += 16;
1580                                 src += 4*sizeof(unsigned char[4]);
1581                         }
1582                 }
1583         }
1584         while (dst < end)
1585         {
1586                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1587                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1588                 dst += 4;
1589                 src += stride;
1590         }
1591 }
1592
// Writes 'size' copies of the 4-float vector at src into dst.
// dst is stored with _mm_store_ps and must be 16-byte aligned; src is read
// once with an unaligned load.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + i*4, value);
}
1603 #endif
1604
// Multiplies numitems 4-float vectors by a 4x4 matrix:
//   out = in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15]
// out4f is stored with _mm_store_ps (16-byte aligned); in4f may be unaligned
// and may be the same pointer as out4f.  A matrix that is bitwise equal to the
// identity turns the call into a plain copy (or nothing, when in place).
// Without SSE2_PRESENT the function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mat0, mat1, mat2, mat3;
	int i;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	mat0 = _mm_loadu_ps(inmatrix16f);
	mat1 = _mm_loadu_ps(inmatrix16f + 4);
	mat2 = _mm_loadu_ps(inmatrix16f + 8);
	mat3 = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
	{
		// aligned input
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			// summed innermost-first: x*mat0 + (y*mat1 + (z*mat2 + w*mat3))
			__m128 zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mat2),
			                       _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mat3));
			__m128 yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), mat1), zw);
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mat0), yzw));
		}
	}
	else
	{
		// unaligned input
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mat2),
			                       _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mat3));
			__m128 yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), mat1), zw);
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mat0), yzw));
		}
	}
#endif
}
1652
// Copies numitems 4-float vectors from in4f to out4f.
// Nothing is copied when numitems is not positive (a negative count would
// otherwise turn into a huge size_t) or when source and destination are the
// same pointer (memcpy must not be given overlapping ranges).
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1657
1658 #ifdef SSE2_PRESENT
// Projects the 4-component vertex 'in' = (x, y, z, w) and assigns the result
// to 'out'.  The vertex is rearranged to (1, x, y, z), multiplied by
// viewportscale, divided by w and offset by viewportcenter, so both viewport
// vectors are indexed in that rearranged lane order.  The result is rotated
// back, giving out = (x', y', z', center[0] + scale[0]/w).
// The expansion declares locals named p and w: 'out', 'viewportcenter' and
// 'viewportscale' must not be expressions that use those names.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1666
// Currently expands to exactly the same code as DPSOFTRAST_PROJECTVERTEX: all
// four lanes are projected, not only y.
// The expansion declares locals named p and w: 'out', 'viewportcenter' and
// 'viewportscale' must not be expressions that use those names.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
        __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
        p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
        p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
        out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1674
// Assigns out = in.x*m0 + in.y*m1 + in.z*m2 + in.w*m3, where in, m0..m3 and
// out are __m128 values ('in' is evaluated once, so out may be the same
// variable as in).
// The expansion declares a local named p: 'out' and m0..m3 must not be
// expressions that use that name.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
        __m128 p = (in); \
        out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
                                                  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
                                                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
                                                                                        _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1683
// Bounds the projected y range of the box spanned by minpos/maxpos.
// The eight box corners are transformed by m0..m3.  A corner whose z + w is
// >= 0 contributes its y/w directly; for every other corner, each box edge
// leading to a contributing neighbour is cut where z + w reaches 0 and the
// cut point contributes instead.  The collected range is clamped to [-2, 2]
// and mapped through the y lane of viewportscale/viewportcenter.
//   *starty - receives the truncated mapped maximum
//   *endy   - receives the truncated mapped minimum plus one
// Returns a mask in which bit k is still set for every corner k whose z + w
// was not >= 0 (bit 0 of k selects max x, bit 1 max y, bit 2 max z/w).
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	int clipmask = 0xFF;
	int k, axis;
	__m128 corner[8], bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f);
	__m128 maxproj = _mm_set_ss(-2.0f);

	// exchange lanes 0 and 1 of the matrix vectors so that the transformed y
	// lands in lane 0, where the scalar (_ss) operations below work
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));

	// the eight combinations of min/max per axis
	corner[0] = minpos;
	corner[1] = _mm_move_ss(minpos, maxpos);
	corner[2] = _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[3] = _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[4] = _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[5] = _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[6] = _mm_move_ss(maxpos, minpos);
	corner[7] = maxpos;

	// pass 1: transform every corner and project those with z + w >= 0
	for (k = 0; k < 8; k++)
	{
		__m128 proj;
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], corner[k], m0, m1, m2, m3);
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
		if (!_mm_ucomige_ss(clipdist[k], _mm_setzero_ps()))
			continue;
		clipmask &= ~(1<<k);
		proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
		minproj = _mm_min_ss(minproj, proj);
		maxproj = _mm_max_ss(maxproj, proj);
	}

	// pass 2: for every corner still flagged, cut the three edges that lead
	// to an unflagged neighbour (k with one axis bit flipped) at z + w == 0
	for (k = 0; k < 8; k++)
	{
		if (!(clipmask&(1<<k)))
			continue;
		for (axis = 1; axis <= 4; axis <<= 1)
		{
			const int n = k^axis;
			__m128 frac, proj;
			if (clipmask&(1<<n))
				continue;
			frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[n]));
			proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[n], bb[k])));
			proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3)));
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// bring lane 2 of the viewport vectors into lane 0
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1754         
1755 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1756 {
1757         float *end = out4f + numitems*4;
1758         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1759         __m128 minpos, maxpos;
1760         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1761         {
1762                 minpos = maxpos = _mm_loadu_ps(in4f);
1763                 while (out4f < end)
1764                 {
1765                         __m128 v = _mm_loadu_ps(in4f);
1766                         minpos = _mm_min_ps(minpos, v);
1767                         maxpos = _mm_max_ps(maxpos, v);
1768                         _mm_store_ps(out4f, v);
1769                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1770                         _mm_store_ps(screen4f, v);
1771                         in4f += 4;
1772                         out4f += 4;
1773                         screen4f += 4;
1774                 }
1775         }
1776         else
1777         {
1778                 minpos = maxpos = _mm_load_ps(in4f);
1779                 while (out4f < end)
1780                 {
1781                         __m128 v = _mm_load_ps(in4f);
1782                         minpos = _mm_min_ps(minpos, v);
1783                         maxpos = _mm_max_ps(maxpos, v);
1784                         _mm_store_ps(out4f, v);
1785                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1786                         _mm_store_ps(screen4f, v);
1787                         in4f += 4;
1788                         out4f += 4;
1789                         screen4f += 4;
1790                 }
1791         }
1792         if (starty && endy) 
1793                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1794                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1795                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1796                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1797                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1798         return 0;
1799 }
1800
1801 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1802 {
1803         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1804         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1805         float *end;
1806         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1807                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1808         end = out4f + numitems*4;
1809         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1810         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1811         m0 = _mm_loadu_ps(inmatrix16f);
1812         m1 = _mm_loadu_ps(inmatrix16f + 4);
1813         m2 = _mm_loadu_ps(inmatrix16f + 8);
1814         m3 = _mm_loadu_ps(inmatrix16f + 12);
1815         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1816         {
1817                 minpos = maxpos = _mm_loadu_ps(in4f);
1818                 while (out4f < end)
1819                 {
1820                         __m128 v = _mm_loadu_ps(in4f);
1821                         minpos = _mm_min_ps(minpos, v);
1822                         maxpos = _mm_max_ps(maxpos, v);
1823                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1824                         _mm_store_ps(out4f, v);
1825                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1826                         _mm_store_ps(screen4f, v);
1827                         in4f += 4;
1828                         out4f += 4;
1829                         screen4f += 4;
1830                 }
1831         }
1832         else
1833         {
1834                 minpos = maxpos = _mm_load_ps(in4f);
1835                 while (out4f < end)
1836                 {
1837                         __m128 v = _mm_load_ps(in4f);
1838                         minpos = _mm_min_ps(minpos, v);
1839                         maxpos = _mm_max_ps(maxpos, v);
1840                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1841                         _mm_store_ps(out4f, v);
1842                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843                         _mm_store_ps(screen4f, v);
1844                         in4f += 4;
1845                         out4f += 4;
1846                         screen4f += 4;
1847                 }
1848         }
1849         if (starty && endy) 
1850                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1851         return 0;
1852 }
1853 #endif
1854
// Converts the current draw's source data for vertex attribute 'inarray'
// (dpsoftrast.numvertices elements starting at dpsoftrast.firstvertex) into
// 4-float vectors in dpsoftrast.post_array4f[outarray] and returns that
// buffer.
//   DPSOFTRAST_ARRAY_POSITION - 3 floats per vertex
//   DPSOFTRAST_ARRAY_COLOR    - 4 floats, else 4 bytes, else the constant
//                               dpsoftrast.color for every vertex
//   anything else             - texcoord unit inarray-DPSOFTRAST_ARRAY_TEXCOORD0
//                               with 2, 3 or 4 float components; the buffer is
//                               left untouched when the unit has no pointer or
//                               a different component count
// Returns NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			const int components = dpsoftrast.components_texcoord[unit];
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			if (components == 2)
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
			else if (components == 3)
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
			else if (components == 4)
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1913
1914 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1915 {
1916         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1917         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1918         return data;
1919 }
1920
#if 0
// Compiled out.  Projects a vertex attribute without transforming it: the
// data (loaded through DPSOFTRAST_Array_Load when inarray >= 0, otherwise the
// current contents of post_array4f[outarray]) is run through
// DPSOFTRAST_Vertex_Project, which fills dpsoftrast.screencoord4f,
// drawstarty and drawendy; its return value is stored in
// dpsoftrast.drawclipped.  Returns the buffer, or NULL without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1933
// Transforms a vertex attribute in place by inmatrix16f and projects it.
// The data (loaded through DPSOFTRAST_Array_Load when inarray >= 0, otherwise
// the current contents of post_array4f[outarray]) is run through
// DPSOFTRAST_Vertex_TransformProject, which fills dpsoftrast.screencoord4f,
// drawstarty and drawendy; its return value is stored in
// dpsoftrast.drawclipped.  Returns the buffer, or NULL without SSE2_PRESENT.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1944
1945 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1946 {
1947         int x;
1948         int startx = span->startx;
1949         int endx = span->endx;
1950         float wslope = triangle->w[0];
1951         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1952         float endz = 1.0f / (w + wslope * startx);
1953         for (x = startx;x < endx;)
1954         {
1955                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1956                 float z = endz, dz;
1957                 if (nextsub >= endx) nextsub = endsub = endx-1;
1958                 endz = 1.0f / (w + wslope * nextsub);
1959                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1960                 for (; x <= endsub; x++, z += dz)
1961                         zf[x] = z;
1962         }
1963 }
1964
1965 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1966 {
1967         int x;
1968         int startx = span->startx;
1969         int endx = span->endx;
1970         int d[4];
1971         float a, b;
1972         unsigned char * RESTRICT pixelmask = span->pixelmask;
1973         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1974         if (!pixel)
1975                 return;
1976         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1977         // handle alphatest now (this affects depth writes too)
1978         if (thread->alphatest)
1979                 for (x = startx;x < endx;x++)
1980                         if (in4f[x*4+3] < 0.5f)
1981                                 pixelmask[x] = false;
1982         // FIXME: this does not handle bigendian
1983         switch(thread->fb_blendmode)
1984         {
1985         case DPSOFTRAST_BLENDMODE_OPAQUE:
1986                 for (x = startx;x < endx;x++)
1987                 {
1988                         if (!pixelmask[x])
1989                                 continue;
1990                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1991                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1992                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1993                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1994                         pixel[x*4+0] = d[0];
1995                         pixel[x*4+1] = d[1];
1996                         pixel[x*4+2] = d[2];
1997                         pixel[x*4+3] = d[3];
1998                 }
1999                 break;
2000         case DPSOFTRAST_BLENDMODE_ALPHA:
2001                 for (x = startx;x < endx;x++)
2002                 {
2003                         if (!pixelmask[x])
2004                                 continue;
2005                         a = in4f[x*4+3] * 255.0f;
2006                         b = 1.0f - in4f[x*4+3];
2007                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2008                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2009                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2010                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2011                         pixel[x*4+0] = d[0];
2012                         pixel[x*4+1] = d[1];
2013                         pixel[x*4+2] = d[2];
2014                         pixel[x*4+3] = d[3];
2015                 }
2016                 break;
2017         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2018                 for (x = startx;x < endx;x++)
2019                 {
2020                         if (!pixelmask[x])
2021                                 continue;
2022                         a = in4f[x*4+3] * 255.0f;
2023                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2024                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2025                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2026                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2027                         pixel[x*4+0] = d[0];
2028                         pixel[x*4+1] = d[1];
2029                         pixel[x*4+2] = d[2];
2030                         pixel[x*4+3] = d[3];
2031                 }
2032                 break;
2033         case DPSOFTRAST_BLENDMODE_ADD:
2034                 for (x = startx;x < endx;x++)
2035                 {
2036                         if (!pixelmask[x])
2037                                 continue;
2038                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2039                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2040                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2041                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2042                         pixel[x*4+0] = d[0];
2043                         pixel[x*4+1] = d[1];
2044                         pixel[x*4+2] = d[2];
2045                         pixel[x*4+3] = d[3];
2046                 }
2047                 break;
2048         case DPSOFTRAST_BLENDMODE_INVMOD:
2049                 for (x = startx;x < endx;x++)
2050                 {
2051                         if (!pixelmask[x])
2052                                 continue;
2053                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2054                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2055                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2056                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2057                         pixel[x*4+0] = d[0];
2058                         pixel[x*4+1] = d[1];
2059                         pixel[x*4+2] = d[2];
2060                         pixel[x*4+3] = d[3];
2061                 }
2062                 break;
2063         case DPSOFTRAST_BLENDMODE_MUL:
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (!pixelmask[x])
2067                                 continue;
2068                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2069                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2070                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2071                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2072                         pixel[x*4+0] = d[0];
2073                         pixel[x*4+1] = d[1];
2074                         pixel[x*4+2] = d[2];
2075                         pixel[x*4+3] = d[3];
2076                 }
2077                 break;
2078         case DPSOFTRAST_BLENDMODE_MUL2:
2079                 for (x = startx;x < endx;x++)
2080                 {
2081                         if (!pixelmask[x])
2082                                 continue;
2083                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2084                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2085                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2086                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2087                         pixel[x*4+0] = d[0];
2088                         pixel[x*4+1] = d[1];
2089                         pixel[x*4+2] = d[2];
2090                         pixel[x*4+3] = d[3];
2091                 }
2092                 break;
2093         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2094                 for (x = startx;x < endx;x++)
2095                 {
2096                         if (!pixelmask[x])
2097                                 continue;
2098                         a = in4f[x*4+3] * -255.0f;
2099                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2100                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2101                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2102                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2103                         pixel[x*4+0] = d[0];
2104                         pixel[x*4+1] = d[1];
2105                         pixel[x*4+2] = d[2];
2106                         pixel[x*4+3] = d[3];
2107                 }
2108                 break;
2109         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2110                 for (x = startx;x < endx;x++)
2111                 {
2112                         if (!pixelmask[x])
2113                                 continue;
2114                         a = 255.0f;
2115                         b = 1.0f - in4f[x*4+3];
2116                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2117                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2118                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2119                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2120                         pixel[x*4+0] = d[0];
2121                         pixel[x*4+1] = d[1];
2122                         pixel[x*4+2] = d[2];
2123                         pixel[x*4+3] = d[3];
2124                 }
2125                 break;
2126         case DPSOFTRAST_BLENDMODE_INVADD:
2127                 for (x = startx;x < endx;x++)
2128                 {
2129                         if (!pixelmask[x])
2130                                 continue;
2131                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2132                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2133                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2134                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2135                         pixel[x*4+0] = d[0];
2136                         pixel[x*4+1] = d[1];
2137                         pixel[x*4+2] = d[2];
2138                         pixel[x*4+3] = d[3];
2139                 }
2140                 break;
2141         }
2142 }
2143
2144 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2145 {
2146 #ifdef SSE2_PRESENT
2147         int x;
2148         int startx = span->startx;
2149         int endx = span->endx;
2150         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2151         unsigned char * RESTRICT pixelmask = span->pixelmask;
2152         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2153         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2154         if (!pixel)
2155                 return;
2156         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2157         pixeli += span->y * dpsoftrast.fb_width + span->x;
2158         // handle alphatest now (this affects depth writes too)
2159         if (thread->alphatest)
2160                 for (x = startx;x < endx;x++)
2161                         if (in4ub[x*4+3] < 0.5f)
2162                                 pixelmask[x] = false;
2163         // FIXME: this does not handle bigendian
2164         switch(thread->fb_blendmode)
2165         {
2166         case DPSOFTRAST_BLENDMODE_OPAQUE:
2167                 for (x = startx;x + 4 <= endx;)
2168                 {
2169                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2170                         {
2171                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2172                                 x += 4;
2173                         }
2174                         else
2175                         {
2176                                 if (pixelmask[x])
2177                                         pixeli[x] = ini[x];
2178                                 x++;
2179                         }
2180                 }
2181                 for (;x < endx;x++)
2182                         if (pixelmask[x])
2183                                 pixeli[x] = ini[x];
2184                 break;
2185         case DPSOFTRAST_BLENDMODE_ALPHA:
2186         #define FINISHBLEND(blend2, blend1) \
2187                 for (x = startx;x + 1 < endx;x += 2) \
2188                 { \
2189                         __m128i src, dst; \
2190                         switch (*(const unsigned short*)&pixelmask[x]) \
2191                         { \
2192                         case 0x0101: \
2193                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2194                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2195                                 blend2; \
2196                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2197                                 continue; \
2198                         case 0x0100: \
2199                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2200                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2201                                 blend1; \
2202                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2203                                 continue; \
2204                         case 0x0001: \
2205                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2206                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2207                                 blend1; \
2208                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2209                                 continue; \
2210                         } \
2211                         break; \
2212                 } \
2213                 for(;x < endx; x++) \
2214                 { \
2215                         __m128i src, dst; \
2216                         if (!pixelmask[x]) \
2217                                 continue; \
2218                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2220                         blend1; \
2221                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2222                 }
2223
2224                 FINISHBLEND({
2225                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2227                 }, {
2228                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230                 });
2231                 break;
2232         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2233                 FINISHBLEND({
2234                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2236                 }, {
2237                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239                 });
2240                 break;
2241         case DPSOFTRAST_BLENDMODE_ADD:
2242                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2243                 break;
2244         case DPSOFTRAST_BLENDMODE_INVMOD:
2245                 FINISHBLEND({
2246                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2247                 }, {
2248                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                 });
2250                 break;
2251         case DPSOFTRAST_BLENDMODE_MUL:
2252                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2253                 break;
2254         case DPSOFTRAST_BLENDMODE_MUL2:
2255                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2256                 break;
2257         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2258                 FINISHBLEND({
2259                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2261                 }, {
2262                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264                 });
2265                 break;
2266         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2267                 FINISHBLEND({
2268                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2270                 }, {
2271                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273                 });
2274                 break;
2275         case DPSOFTRAST_BLENDMODE_INVADD:
2276                 FINISHBLEND({
2277                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2278                 }, {
2279                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                 });
2281                 break;
2282         }
2283 #endif
2284 }
2285
2286 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2287 {
2288         int x;
2289         int startx = span->startx;
2290         int endx = span->endx;
2291         int flags;
2292         float c[4];
2293         float data[4];
2294         float slope[4];
2295         float tc[2], endtc[2];
2296         float tcscale[2];
2297         unsigned int tci[2];
2298         unsigned int tci1[2];
2299         unsigned int tcimin[2];
2300         unsigned int tcimax[2];
2301         int tciwrapmask[2];
2302         int tciwidth;
2303         int filter;
2304         int mip;
2305         const unsigned char * RESTRICT pixelbase;
2306         const unsigned char * RESTRICT pixel[4];
2307         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2308         // if no texture is bound, just fill it with white
2309         if (!texture)
2310         {
2311                 for (x = startx;x < endx;x++)
2312                 {
2313                         out4f[x*4+0] = 1.0f;
2314                         out4f[x*4+1] = 1.0f;
2315                         out4f[x*4+2] = 1.0f;
2316                         out4f[x*4+3] = 1.0f;
2317                 }
2318                 return;
2319         }
2320         mip = triangle->mip[texunitindex];
2321         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2322         // if this mipmap of the texture is 1 pixel, just fill it with that color
2323         if (texture->mipmap[mip][1] == 4)
2324         {
2325                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2326                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2327                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2328                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2329                 for (x = startx;x < endx;x++)
2330                 {
2331                         out4f[x*4+0] = c[0];
2332                         out4f[x*4+1] = c[1];
2333                         out4f[x*4+2] = c[2];
2334                         out4f[x*4+3] = c[3];
2335                 }
2336                 return;
2337         }
2338         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2339         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2340         flags = texture->flags;
2341         tcscale[0] = texture->mipmap[mip][2];
2342         tcscale[1] = texture->mipmap[mip][3];
2343         tciwidth = texture->mipmap[mip][2];
2344         tcimin[0] = 0;
2345         tcimin[1] = 0;
2346         tcimax[0] = texture->mipmap[mip][2]-1;
2347         tcimax[1] = texture->mipmap[mip][3]-1;
2348         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2349         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2350         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2351         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2352         for (x = startx;x < endx;)
2353         {
2354                 unsigned int subtc[2];
2355                 unsigned int substep[2];
2356                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2357                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2358                 if (nextsub >= endx)
2359                 {
2360                         nextsub = endsub = endx-1;      
2361                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2362                 }
2363                 tc[0] = endtc[0];
2364                 tc[1] = endtc[1];
2365                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2366                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2367                 substep[0] = (endtc[0] - tc[0]) * subscale;
2368                 substep[1] = (endtc[1] - tc[1]) * subscale;
2369                 subtc[0] = tc[0] * (1<<16);
2370                 subtc[1] = tc[1] * (1<<16);
2371                 if (filter)
2372                 {
2373                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2374                         {
2375                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2376                                 {
2377                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2378                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2379                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2380                                         tci[0] = subtc[0]>>16;
2381                                         tci[1] = subtc[1]>>16;
2382                                         tci1[0] = tci[0] + 1;
2383                                         tci1[1] = tci[1] + 1;
2384                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2385                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2386                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2387                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2388                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2389                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2390                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2391                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2392                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2393                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2394                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2395                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2396                                         out4f[x*4+0] = c[0];
2397                                         out4f[x*4+1] = c[1];
2398                                         out4f[x*4+2] = c[2];
2399                                         out4f[x*4+3] = c[3];
2400                                 }
2401                         }
2402                         else
2403                         {
2404                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2405                                 {
2406                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2407                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2408                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2409                                         tci[0] = subtc[0]>>16;
2410                                         tci[1] = subtc[1]>>16;
2411                                         tci1[0] = tci[0] + 1;
2412                                         tci1[1] = tci[1] + 1;
2413                                         tci[0] &= tciwrapmask[0];
2414                                         tci[1] &= tciwrapmask[1];
2415                                         tci1[0] &= tciwrapmask[0];
2416                                         tci1[1] &= tciwrapmask[1];
2417                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2418                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2419                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2420                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2421                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2422                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2423                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2424                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2425                                         out4f[x*4+0] = c[0];
2426                                         out4f[x*4+1] = c[1];
2427                                         out4f[x*4+2] = c[2];
2428                                         out4f[x*4+3] = c[3];
2429                                 }
2430                         }
2431                 }
2432                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2433                 {
2434                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2435                         {
2436                                 tci[0] = subtc[0]>>16;
2437                                 tci[1] = subtc[1]>>16;
2438                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2439                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2440                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2441                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2442                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2443                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2444                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2445                                 out4f[x*4+0] = c[0];
2446                                 out4f[x*4+1] = c[1];
2447                                 out4f[x*4+2] = c[2];
2448                                 out4f[x*4+3] = c[3];
2449                         }
2450                 }
2451                 else
2452                 {
2453                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2454                         {
2455                                 tci[0] = subtc[0]>>16;
2456                                 tci[1] = subtc[1]>>16;
2457                                 tci[0] &= tciwrapmask[0];
2458                                 tci[1] &= tciwrapmask[1];
2459                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2461                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2462                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2463                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2464                                 out4f[x*4+0] = c[0];
2465                                 out4f[x*4+1] = c[1];
2466                                 out4f[x*4+2] = c[2];
2467                                 out4f[x*4+3] = c[3];
2468                         }
2469                 }
2470         }
2471 }
2472
2473 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2474 {
2475 #ifdef SSE2_PRESENT
2476         int x;
2477         int startx = span->startx;
2478         int endx = span->endx;
2479         int flags;
2480         __m128 data, slope, tcscale;
2481         __m128i tcsize, tcmask, tcoffset, tcmax;
2482         __m128 tc, endtc;
2483         __m128i subtc, substep, endsubtc;
2484         int filter;
2485         int mip;
2486         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2487         const unsigned char * RESTRICT pixelbase;
2488         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2489         // if no texture is bound, just fill it with white
2490         if (!texture)
2491         {
2492                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2493                 return;
2494         }
2495         mip = triangle->mip[texunitindex];
2496         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2497         // if this mipmap of the texture is 1 pixel, just fill it with that color
2498         if (texture->mipmap[mip][1] == 4)
2499         {
2500                 unsigned int k = *((const unsigned int *)pixelbase);
2501                 for (x = startx;x < endx;x++)
2502                         outi[x] = k;
2503                 return;
2504         }
2505         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2506         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2507         flags = texture->flags;
2508         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2509         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2510         tcscale = _mm_cvtepi32_ps(tcsize);
2511         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2512         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2513         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2514         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2515         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2516         tcmax = _mm_packs_epi32(tcmask, tcmask);
2517         for (x = startx;x < endx;)
2518         {
2519                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2520                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2521                 if (nextsub >= endx)
2522                 {
2523                         nextsub = endsub = endx-1;
2524                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2525                 }       
2526                 tc = endtc;
2527                 subtc = endsubtc;
2528                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2529                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2530                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2531                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2532                 substep = _mm_slli_epi32(substep, 1);
2533                 if (filter)
2534                 {
2535                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2536                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2537                         {
2538                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2539                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2540                                 {
2541                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2542                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2543                                         tci = _mm_madd_epi16(tci, tcoffset);
2544                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2545                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2546                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2547                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2548                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2549                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2550                                         fracm = _mm_srli_epi16(subtc, 1);
2551                                         pix1 = _mm_add_epi16(pix1,
2552                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2553                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2554                                         pix3 = _mm_add_epi16(pix3,
2555                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2556                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2557                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2558                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2559                                         pix2 = _mm_add_epi16(pix2,
2560                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2561                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2562                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2563                                 }
2564                                 if (x <= endsub)
2565                                 {
2566                                         const unsigned char * RESTRICT ptr1;
2567                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2568                                         tci = _mm_madd_epi16(tci, tcoffset);
2569                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2570                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2571                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2572                                         fracm = _mm_srli_epi16(subtc, 1);
2573                                         pix1 = _mm_add_epi16(pix1,
2574                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2575                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2576                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2577                                         pix1 = _mm_add_epi16(pix1,
2578                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2579                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2580                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2581                                         x++;
2582                                 }
2583                         }
2584                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2585                         {
2586                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2587                                 {
2588                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2589                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2590                                         tci = _mm_madd_epi16(tci, tcoffset);
2591                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2592                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2593                                                                                         _mm_setzero_si128());
2594                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2595                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2596                                                                                         _mm_setzero_si128());
2597                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2598                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2599                                         tci = _mm_madd_epi16(tci, tcoffset);
2600                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2601                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2602                                                                                         _mm_setzero_si128());
2603                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2604                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2605                                                                                         _mm_setzero_si128());
2606                                         fracm = _mm_srli_epi16(subtc, 1);
2607                                         pix1 = _mm_add_epi16(pix1,
2608                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2609                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2610                                         pix3 = _mm_add_epi16(pix3,
2611                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2612                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2613                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2614                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2615                                         pix2 = _mm_add_epi16(pix2,
2616                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2617                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2618                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2619                                 }
2620                                 if (x <= endsub)
2621                                 {
2622                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2623                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2624                                         tci = _mm_madd_epi16(tci, tcoffset);
2625                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2626                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2627                                                                                         _mm_setzero_si128());
2628                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2629                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2630                                                                                         _mm_setzero_si128());
2631                                         fracm = _mm_srli_epi16(subtc, 1);
2632                                         pix1 = _mm_add_epi16(pix1,
2633                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2636                                         pix1 = _mm_add_epi16(pix1,
2637                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2638                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2639                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2640                                         x++;
2641                                 }
2642                         }
2643                         else
2644                         {
2645                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2646                                 {
2647                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2648                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2649                                         tci = _mm_madd_epi16(tci, tcoffset);
2650                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2651                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2652                                                                                         _mm_setzero_si128());
2653                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2654                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2655                                                                                         _mm_setzero_si128());
2656                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2657                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2658                                         tci = _mm_madd_epi16(tci, tcoffset);
2659                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2660                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2661                                                                                         _mm_setzero_si128());
2662                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2663                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2664                                                                                         _mm_setzero_si128());
2665                                         fracm = _mm_srli_epi16(subtc, 1);
2666                                         pix1 = _mm_add_epi16(pix1,
2667                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2668                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2669                                         pix3 = _mm_add_epi16(pix3,
2670                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2671                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2672                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2673                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2674                                         pix2 = _mm_add_epi16(pix2,
2675                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2676                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2677                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2678                                 }
2679                                 if (x <= endsub)
2680                                 {
2681                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2682                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2683                                         tci = _mm_madd_epi16(tci, tcoffset);
2684                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2685                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2686                                                                                         _mm_setzero_si128());
2687                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2688                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2689                                                                                         _mm_setzero_si128());
2690                                         fracm = _mm_srli_epi16(subtc, 1);
2691                                         pix1 = _mm_add_epi16(pix1,
2692                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2693                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2694                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2695                                         pix1 = _mm_add_epi16(pix1,
2696                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2697                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2698                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2699                                         x++;
2700                                 }
2701                         }
2702                 }
2703                 else
2704                 {
2705                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2706                         {
2707                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2708                                 {
2709                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2710                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2711                                         tci = _mm_madd_epi16(tci, tcoffset);
2712                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2713                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2714                                 }
2715                                 if (x <= endsub)
2716                                 {
2717                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2718                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2719                                         tci = _mm_madd_epi16(tci, tcoffset);
2720                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2721                                         x++;
2722                                 }
2723                         }
2724                         else
2725                         {
2726                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2727                                 {
2728                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2729                                         tci = _mm_and_si128(tci, tcmax); 
2730                                         tci = _mm_madd_epi16(tci, tcoffset);
2731                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2732                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2733                                 }
2734                                 if (x <= endsub)
2735                                 {
2736                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2737                                         tci = _mm_and_si128(tci, tcmax); 
2738                                         tci = _mm_madd_epi16(tci, tcoffset);
2739                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2740                                         x++;
2741                                 }
2742                         }
2743                 }
2744         }
2745 #endif
2746 }
2747
2748 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2749 {
2750         // TODO: IMPLEMENT
2751         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2752 }
2753
// Shadowmap lookup stub: the input vector is ignored and the result is
// always 1.0f (presumably meaning "not shadowed" - confirm against callers).
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float placeholder = 1.0f;
	(void)vector;
	return placeholder;
}
2759
2760 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2761 {
2762         int x;
2763         int startx = span->startx;
2764         int endx = span->endx;
2765         float c[4];
2766         float data[4];
2767         float slope[4];
2768         float z;
2769         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2770         for (x = startx;x < endx;x++)
2771         {
2772                 z = zf[x];
2773                 c[0] = (data[0] + slope[0]*x) * z;
2774                 c[1] = (data[1] + slope[1]*x) * z;
2775                 c[2] = (data[2] + slope[2]*x) * z;
2776                 c[3] = (data[3] + slope[3]*x) * z;
2777                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2778                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2779                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2780                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2781         }
2782 }
2783
2784 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2785 {
2786         int x;
2787         int startx = span->startx;
2788         int endx = span->endx;
2789         float c[4];
2790         float data[4];
2791         float slope[4];
2792         float z;
2793         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2794         for (x = startx;x < endx;x++)
2795         {
2796                 z = zf[x];
2797                 c[0] = (data[0] + slope[0]*x) * z;
2798                 c[1] = (data[1] + slope[1]*x) * z;
2799                 c[2] = (data[2] + slope[2]*x) * z;
2800                 c[3] = (data[3] + slope[3]*x) * z;
2801                 out4f[x*4+0] = c[0];
2802                 out4f[x*4+1] = c[1];
2803                 out4f[x*4+2] = c[2];
2804                 out4f[x*4+3] = c[3];
2805         }
2806 }
2807
2808 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2809 {
2810         int x, startx = span->startx, endx = span->endx;
2811         float c[4], localcolor[4];
2812         localcolor[0] = subcolor[0];
2813         localcolor[1] = subcolor[1];
2814         localcolor[2] = subcolor[2];
2815         localcolor[3] = subcolor[3];
2816         for (x = startx;x < endx;x++)
2817         {
2818                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2819                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2820                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2821                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2822                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2823                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2824                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2825                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2826         }
2827 }
2828
2829 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2830 {
2831         int x, startx = span->startx, endx = span->endx;
2832         for (x = startx;x < endx;x++)
2833         {
2834                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2835                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2836                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2837                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2838         }
2839 }
2840
2841 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2842 {
2843         int x, startx = span->startx, endx = span->endx;
2844         for (x = startx;x < endx;x++)
2845         {
2846                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2847                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2848                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2849                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2850         }
2851 }
2852
2853 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2854 {
2855         int x, startx = span->startx, endx = span->endx;
2856         float a, b;
2857         for (x = startx;x < endx;x++)
2858         {
2859                 a = 1.0f - inb4f[x*4+3];
2860                 b = inb4f[x*4+3];
2861                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2862                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2863                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2864                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2865         }
2866 }
2867
2868 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2869 {
2870         int x, startx = span->startx, endx = span->endx;
2871         float localcolor[4], ilerp, lerp;
2872         localcolor[0] = color[0];
2873         localcolor[1] = color[1];
2874         localcolor[2] = color[2];
2875         localcolor[3] = color[3];
2876         ilerp = 1.0f - localcolor[3];
2877         lerp = localcolor[3];
2878         for (x = startx;x < endx;x++)
2879         {
2880                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2881                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2882                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2883                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2884         }
2885 }
2886
2887
2888
2889 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2890 {
2891 #ifdef SSE2_PRESENT
2892         int x;
2893         int startx = span->startx;
2894         int endx = span->endx;
2895         __m128 data, slope;
2896         __m128 mod, endmod;
2897         __m128i submod, substep, endsubmod;
2898         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2899         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2900         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2901         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2902         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2903         for (x = startx; x < endx;)
2904         {
2905                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2906                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2907                 if (nextsub >= endx)
2908                 {
2909                         nextsub = endsub = endx-1;
2910                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2911                 }
2912                 mod = endmod;
2913                 submod = endsubmod;
2914                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2915                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2916                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2917                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2918                 substep = _mm_packs_epi32(substep, substep);
2919                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2920                 {
2921                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2922                         pix = _mm_mulhi_epu16(pix, submod);
2923                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2924                 }
2925                 if (x <= endsub)
2926                 {
2927                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2928                         pix = _mm_mulhi_epu16(pix, submod);
2929                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2930                         x++;
2931                 }
2932         }
2933 #endif
2934 }
2935
2936 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2937 {
2938 #ifdef SSE2_PRESENT
2939         int x;
2940         int startx = span->startx;
2941         int endx = span->endx;
2942         __m128 data, slope;
2943         __m128 mod, endmod;
2944         __m128i submod, substep, endsubmod;
2945         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2946         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2947         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2948         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2949         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2950         for (x = startx; x < endx;)
2951         {
2952                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2953                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2954                 if (nextsub >= endx)
2955                 {
2956                         nextsub = endsub = endx-1;
2957                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2958                 }
2959                 mod = endmod;
2960                 submod = endsubmod;
2961                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2962                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2963                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2964                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2965                 substep = _mm_packs_epi32(substep, substep);
2966                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2967                 {
2968                         __m128i pix = _mm_srai_epi16(submod, 4);
2969                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2970                 }
2971                 if (x <= endsub)
2972                 {
2973                         __m128i pix = _mm_srai_epi16(submod, 4);
2974                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2975                         x++;
2976                 }
2977         }
2978 #endif
2979 }
2980
2981 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2982 {
2983 #ifdef SSE2_PRESENT
2984         int x, startx = span->startx, endx = span->endx;
2985         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2986         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2987         for (x = startx;x+2 <= endx;x+=2)
2988         {
2989                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2990                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2991                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2992                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2993         }
2994         if (x < endx)
2995         {
2996                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2997                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2998                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2999                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3000         }
3001 #endif
3002 }
3003
3004 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3005 {
3006 #ifdef SSE2_PRESENT
3007         int x, startx = span->startx, endx = span->endx;
3008         for (x = startx;x+2 <= endx;x+=2)
3009         {
3010                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3011                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3012                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3013                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3014         }
3015         if (x < endx)
3016         {
3017                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3018                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3019                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3020                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3021         }
3022 #endif
3023 }
3024
3025 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3026 {
3027 #ifdef SSE2_PRESENT
3028         int x, startx = span->startx, endx = span->endx;
3029         for (x = startx;x+2 <= endx;x+=2)
3030         {
3031                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3032                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3033                 pix1 = _mm_add_epi16(pix1, pix2);
3034                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3035         }
3036         if (x < endx)
3037         {
3038                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3039                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3040                 pix1 = _mm_add_epi16(pix1, pix2);
3041                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3042         }
3043 #endif
3044 }
3045
3046 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3047 {
3048 #ifdef SSE2_PRESENT
3049         int x, startx = span->startx, endx = span->endx;
3050         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3051         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3052         for (x = startx;x+2 <= endx;x+=2)
3053         {
3054                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3055                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3056                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3057                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3058         }
3059         if (x < endx)
3060         {
3061                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3062                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3063                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3064                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3065         }
3066 #endif
3067 }
3068
3069 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3070 {
3071 #ifdef SSE2_PRESENT
3072         int x, startx = span->startx, endx = span->endx;
3073         for (x = startx;x+2 <= endx;x+=2)
3074         {
3075                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3076                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3077                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3078                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3079                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3080         }
3081         if (x < endx)
3082         {
3083                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3084                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3085                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3086                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3087                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3088         }
3089 #endif
3090 }
3091
3092 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3093 {
3094 #ifdef SSE2_PRESENT
3095         int x, startx = span->startx, endx = span->endx;
3096         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3097         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3098         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3099         for (x = startx;x+2 <= endx;x+=2)
3100         {
3101                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3102                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3103                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3104         }
3105         if (x < endx)
3106         {
3107                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3108                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3109                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3110         }
3111 #endif
3112 }
3113
3114
3115
3116 void DPSOFTRAST_VertexShader_Generic(void)
3117 {
3118         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3119         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3120         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3121         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3122                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3123 }
3124
3125 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3126 {
3127         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3128         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3129         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3130         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3132         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3133         {
3134                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3135                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3136                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3137                 {
3138                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3139                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3140                         {
3141                                 // multiply
3142                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3143                         }
3144                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3145                         {
3146                                 // add
3147                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3148                         }
3149                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3150                         {
3151                                 // alphablend
3152                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3153                         }
3154                 }
3155         }
3156         else
3157                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3158         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3159 }
3160
3161
3162
3163 void DPSOFTRAST_VertexShader_PostProcess(void)
3164 {
3165         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3166         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3167         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3168 }
3169
3170 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3171 {
3172         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3173         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3174         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3175         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3176         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3177         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3178         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3179         {
3180                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3181                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3182         }
3183         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3184         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3185         {
3186                 // TODO: implement saturation
3187         }
3188         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3189         {
3190                 // TODO: implement gammaramps
3191         }
3192         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3193 }
3194
3195
3196
3197 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3198 {
3199         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3200 }
3201
3202 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3203 {
3204         // this is never called (because colormask is off when this shader is used)
3205         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3206         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3207         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3208         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3209         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3210 }
3211
3212
3213
3214 void DPSOFTRAST_VertexShader_FlatColor(void)
3215 {
3216         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3217         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3218 }
3219
3220 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3221 {
3222 #ifdef SSE2_PRESENT
3223         unsigned char * RESTRICT pixelmask = span->pixelmask;
3224         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3225         int x, startx = span->startx, endx = span->endx;
3226         __m128i Color_Ambientm;
3227         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3228         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3230         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3231         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3232         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3233                 pixel = buffer_FragColorbgra8;
3234         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3235         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3236         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3237         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3238         for (x = startx;x < endx;x++)
3239         {
3240                 __m128i color, pix;
3241                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3242                 {
3243                         __m128i pix2;
3244                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3245                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3246                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3247                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3248                         x += 3;
3249                         continue;
3250                 }
3251                 if (!pixelmask[x])
3252                         continue;
3253                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3254                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3255                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3256         }
3257         if (pixel == buffer_FragColorbgra8)
3258                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3259 #endif
3260 }
3261
3262
3263
3264 void DPSOFTRAST_VertexShader_VertexColor(void)
3265 {
3266         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3267         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3268         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3269 }
3270
// Pixel stage of the vertex color shader for one span.  Per channel each
// covered pixel is  texture * (Color_Diffuse * attribute * z + Color_Ambient),
// where attribute is the interpolated DPSOFTRAST_ARRAY_COLOR value and z is
// buffer_z[x]; lane 3 uses the Alpha uniform instead of ambient and has no
// diffuse term.
//
// When neither alpha test nor blending is active the result is written
// straight into dpsoftrast.fb_colorpixels[0]; otherwise it goes to a
// temporary buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Pixels whose span->pixelmask entry is zero are left untouched.
// Does nothing when SSE2_PRESENT is not defined.
void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
{
#ifdef SSE2_PRESENT
	unsigned char * RESTRICT pixelmask = span->pixelmask;
	unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
	int x, startx = span->startx, endx = span->endx;
	__m128i Color_Ambientm, Color_Diffusem;
	__m128 data, slope;
	float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
	unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
	unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
	int arrayindex = DPSOFTRAST_ARRAY_COLOR;
	DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
	DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
	// alpha test and blending need the finishing pass, so render to the
	// temporary buffer in that case
	if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
		pixel = buffer_FragColorbgra8;
	// ambient * 256 with lanes 0 and 2 swapped, lane 3 replaced by alpha * 255,
	// packed to 16 bit (both halves identical)
	Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
	Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
	Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
	Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
	// diffuse * 4096 with lanes 0 and 2 swapped, lane 3 cleared, packed to 16 bit
	Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
	Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
	Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
	// attribute value and per-pixel slope, lanes 0 and 2 swapped, advanced to
	// startx and scaled by 4096 so that mulhi(diffuse*4096, attr*4096) lands on
	// the same 256 scale as the ambient term
	DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
	data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
	slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
	data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
	data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
	slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
	// note: the loop increment advances data by one slope on every iteration,
	// including those that leave through continue
	for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
	{
		__m128i color, mod, pix;
		// fast path: four consecutive covered pixels at once
		if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
		{
			__m128i pix2, mod2;
			__m128 z = _mm_loadu_ps(&buffer_z[x]);
			color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
			// data is stepped three times here; the fourth step comes from the
			// loop increment after continue
			mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
			data = _mm_add_ps(data, slope);
			mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
			data = _mm_add_ps(data, slope);
			mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
			data = _mm_add_ps(data, slope);
			mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
			// (diffuse * attribute + ambient) * texture; the texture bytes are
			// widened into the high byte so mulhi divides by 256
			pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
								  _mm_unpacklo_epi8(_mm_setzero_si128(), color));
			pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
								   _mm_unpackhi_epi8(_mm_setzero_si128(), color));
			_mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
			// together with the loop's x++ this skips the four pixels just written
			x += 3;
			continue;
		}
		// uncovered pixel: nothing is written
		if (!pixelmask[x])
			continue;
		// single covered pixel
		color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
		mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
		mod = _mm_packs_epi32(mod, mod);
		pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
		*(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
	}
	// only the temporary buffer needs the finishing pass
	if (pixel == buffer_FragColorbgra8)
		DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
#endif
}
3335
3336
3337
3338 void DPSOFTRAST_VertexShader_Lightmap(void)
3339 {
3340         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3341         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3342         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3343 }
3344
3345 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3346 {
3347 #ifdef SSE2_PRESENT
3348         unsigned char * RESTRICT pixelmask = span->pixelmask;
3349         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3350         int x, startx = span->startx, endx = span->endx;
3351         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3352         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3353         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3354         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3355         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3358         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3359         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3360         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3361                 pixel = buffer_FragColorbgra8;
3362         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3363         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3364         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3365         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3366         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3367         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3368         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3369         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3370         {
3371                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3372                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3373                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3375                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3376                 for (x = startx;x < endx;x++)
3377                 {
3378                         __m128i color, lightmap, glow, pix;
3379                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3380                         {
3381                                 __m128i pix2;
3382                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3383                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3384                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3385                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3386                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3387                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3388                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3389                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3390                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3391                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3392                                 x += 3;
3393                                 continue;
3394                         }
3395                         if (!pixelmask[x])
3396                                 continue;
3397                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3398                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3399                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3400                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3401                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3402                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3403                 }
3404         }
3405         else
3406         {
3407                 for (x = startx;x < endx;x++)
3408                 {
3409                         __m128i color, lightmap, pix;
3410                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3411                         {
3412                                 __m128i pix2;
3413                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3414                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3415                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3416                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3417                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3418                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3419                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3420                                 x += 3;
3421                                 continue;
3422                         }
3423                         if (!pixelmask[x]) 
3424                                 continue;
3425                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3426                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3427                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3428                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3429                 }
3430         }
3431         if (pixel == buffer_FragColorbgra8)
3432                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3433 #endif
3434 }
3435
3436
3437
3438 void DPSOFTRAST_VertexShader_FakeLight(void)
3439 {
3440         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3441 }
3442
3443 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3444 {
3445         // TODO: IMPLEMENT
3446         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3447         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3448         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3449         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3450         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3451 }
3452
3453
3454
// Vertex stage for the model-space light direction map shader.
// It has no code of its own: all work is delegated to the lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3459
3460 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3461 {
3462         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3463         // TODO: IMPLEMENT
3464 }
3465
3466
3467
// Vertex stage for the tangent-space light direction map shader.
// It has no code of its own: all work is delegated to the lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3472
3473 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3474 {
3475         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3476         // TODO: IMPLEMENT
3477 }
3478
3479
3480
3481 void DPSOFTRAST_VertexShader_LightDirection(void)
3482 {
3483         int i;
3484         int numvertices = dpsoftrast.numvertices;
3485         float LightDir[4];
3486         float LightVector[4];
3487         float EyePosition[4];
3488         float EyeVectorModelSpace[4];
3489         float EyeVector[4];
3490         float position[4];
3491         float svector[4];
3492         float tvector[4];
3493         float normal[4];
3494         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3495         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3496         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3497         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3498         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3499         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3500         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3501         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3502         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3503         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3504         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3505         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3506         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3507         for (i = 0;i < numvertices;i++)
3508         {
3509                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3510                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3511                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3512                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3513                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3514                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3515                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3516                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3517                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3518                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3519                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3520                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3521                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3522                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3523                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3524                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3525                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3526                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3527                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3528                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3529                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3530                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3531                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3532                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3533                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3534                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3535                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3536                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3537                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3538         }
3539         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3540 }
3541
3542 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3543 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3544 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3545 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3546 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3547 #define DPSOFTRAST_Vector3Normalize(v)\
3548 do\
3549 {\
3550         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3551         if (len)\
3552         {\
3553                 len = 1.0f / len;\
3554                 v[0] *= len;\
3555                 v[1] *= len;\
3556                 v[2] *= len;\
3557         }\
3558 }\
3559 while(0)
3560
3561 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3562 {
3563         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3564         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3565         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3566         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3567         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3568         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3569         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3570         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571         int x, startx = span->startx, endx = span->endx;
3572         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3573         float LightVectordata[4];
3574         float LightVectorslope[4];
3575         float EyeVectordata[4];
3576         float EyeVectorslope[4];
3577         float z;
3578         float diffusetex[4];
3579         float glosstex[4];
3580         float surfacenormal[4];
3581         float lightnormal[4];
3582         float eyenormal[4];
3583         float specularnormal[4];
3584         float diffuse;
3585         float specular;
3586         float SpecularPower;
3587         int d[4];
3588         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3589         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3590         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3591         Color_Glow[3] = 0.0f;
3592         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3593         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3594         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3595         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3596         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3597         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3598         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3599         Color_Pants[3] = 0.0f;
3600         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3601         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3602         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3603         Color_Shirt[3] = 0.0f;
3604         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3605         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3606         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3607         {
3608                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3609                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3610         }
3611         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3612         {
3613                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3614         }
3615         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3616         {
3617                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3618                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3619                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3620                 Color_Diffuse[3] = 0.0f;
3621                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3622                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3623                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3624                 LightColor[3] = 0.0f;
3625                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3626                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3628                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3629                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3630                 Color_Specular[3] = 0.0f;
3631                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3632                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3633                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3634                 for (x = startx;x < endx;x++)
3635                 {
3636                         z = buffer_z[x];
3637                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3638                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3639                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3640                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3641                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3642                         {
3643                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3644                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3645                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3646                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3647                         }
3648                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3649                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3650                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3651                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3652                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3653                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3654                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3655                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3656
3657                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3658                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3659                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3660                         DPSOFTRAST_Vector3Normalize(lightnormal);
3661
3662                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3663                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3664                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3665                         DPSOFTRAST_Vector3Normalize(eyenormal);
3666
3667                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3668                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3669                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3670                         DPSOFTRAST_Vector3Normalize(specularnormal);
3671
3672                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3673                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3674                         specular = pow(specular, SpecularPower * glosstex[3]);
3675                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3676                         {
3677                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3678                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3679                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3680                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3681                         }
3682                         else
3683                         {
3684                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3685                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3686                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3687                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3688                         }
3689                         buffer_FragColorbgra8[x*4+0] = d[0];
3690                         buffer_FragColorbgra8[x*4+1] = d[1];
3691                         buffer_FragColorbgra8[x*4+2] = d[2];
3692                         buffer_FragColorbgra8[x*4+3] = d[3];
3693                 }
3694         }
3695         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3696         {
3697                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3698                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3699                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3700                 Color_Diffuse[3] = 0.0f;
3701                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3702                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3703                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3704                 LightColor[3] = 0.0f;
3705                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3706                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3707                 for (x = startx;x < endx;x++)
3708                 {
3709                         z = buffer_z[x];
3710                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3711                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3712                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3713                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3714                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3715                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3716                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3717                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3718
3719                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3720                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3721                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3722                         DPSOFTRAST_Vector3Normalize(lightnormal);
3723
3724                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3725                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3726                         {
3727                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3728                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3729                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3730                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3731                         }
3732                         else
3733                         {
3734                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3735                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3736                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3737                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3738                         }
3739                         buffer_FragColorbgra8[x*4+0] = d[0];
3740                         buffer_FragColorbgra8[x*4+1] = d[1];
3741                         buffer_FragColorbgra8[x*4+2] = d[2];
3742                         buffer_FragColorbgra8[x*4+3] = d[3];
3743                 }
3744         }
3745         else
3746         {
3747                 for (x = startx;x < endx;x++)
3748                 {
3749                         z = buffer_z[x];
3750                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3751                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3752                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3753                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3754
3755                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3756                         {
3757                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3758                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3759                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3760                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3761                         }
3762                         else
3763                         {
3764                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3765                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3766                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3767                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3768                         }
3769                         buffer_FragColorbgra8[x*4+0] = d[0];
3770                         buffer_FragColorbgra8[x*4+1] = d[1];
3771                         buffer_FragColorbgra8[x*4+2] = d[2];
3772                         buffer_FragColorbgra8[x*4+3] = d[3];
3773                 }
3774         }
3775         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3776 }
3777
3778
3779
3780 void DPSOFTRAST_VertexShader_LightSource(void)
3781 {
3782         int i;
3783         int numvertices = dpsoftrast.numvertices;
3784         float LightPosition[4];
3785         float LightVector[4];
3786         float LightVectorModelSpace[4];
3787         float EyePosition[4];
3788         float EyeVectorModelSpace[4];
3789         float EyeVector[4];
3790         float position[4];
3791         float svector[4];
3792         float tvector[4];
3793         float normal[4];
3794         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3795         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3796         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3797         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3798         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3799         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3800         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3801         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3802         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3803         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3804         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3805         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3806         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3807         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3808         for (i = 0;i < numvertices;i++)
3809         {
3810                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3811                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3812                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3813                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3814                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3815                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3816                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3817                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3818                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3819                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3820                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3821                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3822                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3823                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3824                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3825                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3826                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3827                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3828                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3829                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3830                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3831                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3832                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3833                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3834                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3835                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3836                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3837                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3838                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3839                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3840                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3841                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3842         }
3843         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3844         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3845 }
3846
3847 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3848 {
3849 #ifdef SSE2_PRESENT
3850         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3851         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3852         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3853         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3854         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3855         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3856         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3857         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3858         int x, startx = span->startx, endx = span->endx;
3859         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3860         float CubeVectordata[4];
3861         float CubeVectorslope[4];
3862         float LightVectordata[4];
3863         float LightVectorslope[4];
3864         float EyeVectordata[4];
3865         float EyeVectorslope[4];
3866         float z;
3867         float diffusetex[4];
3868         float glosstex[4];
3869         float surfacenormal[4];
3870         float lightnormal[4];
3871         float eyenormal[4];
3872         float specularnormal[4];
3873         float diffuse;
3874         float specular;
3875         float SpecularPower;
3876         float CubeVector[4];
3877         float attenuation;
3878         int d[4];
3879         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3880         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3881         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3882         Color_Glow[3] = 0.0f;
3883         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3884         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3885         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3886         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3887         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3888         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3889         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3890         Color_Diffuse[3] = 0.0f;
3891         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3892         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3893         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3894         Color_Specular[3] = 0.0f;
3895         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3896         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3897         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3898         Color_Pants[3] = 0.0f;
3899         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3900         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3901         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3902         Color_Shirt[3] = 0.0f;
3903         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3904         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3905         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3906         LightColor[3] = 0.0f;
3907         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3908         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3909         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3910         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3911         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3912         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3913         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3914         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3915         {
3916                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3917                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3918         }
3919         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3920                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3921         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3922         {
3923                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3924                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3925                 for (x = startx;x < endx;x++)
3926                 {
3927                         z = buffer_z[x];
3928                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3929                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3930                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3931                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3932                         if (attenuation < 0.01f)
3933                                 continue;
3934                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3935                         {
3936                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3937                                 if (attenuation < 0.01f)
3938                                         continue;
3939                         }
3940
3941                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3942                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3943                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3944                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3945                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3946                         {
3947                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3948                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3949                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3950                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3951                         }
3952                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3953                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3954                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3955                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3956                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3957                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3958                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3959                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3960
3961                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3962                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3963                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3964                         DPSOFTRAST_Vector3Normalize(lightnormal);
3965
3966                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3967                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3968                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3969                         DPSOFTRAST_Vector3Normalize(eyenormal);
3970
3971                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3972                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3973                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3974                         DPSOFTRAST_Vector3Normalize(specularnormal);
3975
3976                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3977                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3978                         specular = pow(specular, SpecularPower * glosstex[3]);
3979                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3980                         {
3981                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3982                                 attenuation *= (1.0f / 255.0f);
3983                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3984                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3985                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3986                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3987                         }
3988                         else
3989                         {
3990                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3991                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3992                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3993                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3994                         }
3995                         buffer_FragColorbgra8[x*4+0] = d[0];
3996                         buffer_FragColorbgra8[x*4+1] = d[1];
3997                         buffer_FragColorbgra8[x*4+2] = d[2];
3998                         buffer_FragColorbgra8[x*4+3] = d[3];
3999                 }
4000         }
4001         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4002         {
4003                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4004                 for (x = startx;x < endx;x++)
4005                 {
4006                         z = buffer_z[x];
4007                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4008                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4009                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4010                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4011                         if (attenuation < 0.01f)
4012                                 continue;
4013                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4014                         {
4015                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4016                                 if (attenuation < 0.01f)
4017                                         continue;
4018                         }
4019
4020                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4021                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4022                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4023                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4024                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4025                         {
4026                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4027                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4028                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4029                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4030                         }
4031                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4032                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4033                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4034                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4035
4036                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4037                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4038                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4039                         DPSOFTRAST_Vector3Normalize(lightnormal);
4040
4041                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4042                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4043                         {
4044                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4045                                 attenuation *= (1.0f / 255.0f);
4046                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4047                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4048                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4049                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4050                         }
4051                         else
4052                         {
4053                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4054                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4055                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4056                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4057                         }
4058                         buffer_FragColorbgra8[x*4+0] = d[0];
4059                         buffer_FragColorbgra8[x*4+1] = d[1];
4060                         buffer_FragColorbgra8[x*4+2] = d[2];
4061                         buffer_FragColorbgra8[x*4+3] = d[3];
4062                 }
4063         }
4064         else
4065         {
4066                 for (x = startx;x < endx;x++)
4067                 {
4068                         z = buffer_z[x];
4069                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4070                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4071                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4072                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4073                         if (attenuation < 0.01f)
4074                                 continue;
4075                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4076                         {
4077                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4078                                 if (attenuation < 0.01f)
4079                                         continue;
4080                         }
4081
4082                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4083                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4084                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4085                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4086                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4087                         {
4088                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4089                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4090                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4091                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4092                         }
4093                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4094                         {
4095                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4096                                 attenuation *= (1.0f / 255.0f);
4097                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4098                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4099                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4100                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4101                         }
4102                         else
4103                         {
4104                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4105                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4106                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4107                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4108                         }
4109                         buffer_FragColorbgra8[x*4+0] = d[0];
4110                         buffer_FragColorbgra8[x*4+1] = d[1];
4111                         buffer_FragColorbgra8[x*4+2] = d[2];
4112                         buffer_FragColorbgra8[x*4+3] = d[3];
4113                 }
4114         }
4115         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4116 #endif
4117 }
4118
4119
4120
4121 void DPSOFTRAST_VertexShader_Refraction(void)
4122 {
4123         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4124 }
4125
4126 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4127 {
4128         // TODO: IMPLEMENT
4129         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4130         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4131         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4132         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4133         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4134 }
4135
4136
4137
4138 void DPSOFTRAST_VertexShader_Water(void)
4139 {
4140         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4141 }
4142
4143
4144 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4145 {
4146         // TODO: IMPLEMENT
4147         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4148         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4150         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4151         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4152 }
4153
4154
4155
4156 void DPSOFTRAST_VertexShader_ShowDepth(void)
4157 {
4158         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4159 }
4160
4161 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4162 {
4163         // TODO: IMPLEMENT
4164         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4165         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4166         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4167         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4168         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4169 }
4170
4171
4172
4173 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4174 {
4175         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4176 }
4177
4178 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4179 {
4180         // TODO: IMPLEMENT
4181         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4182         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4183         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4184         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4185         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4186 }
4187
4188
4189
4190 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4191 {
4192         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4193 }
4194
4195 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4196 {
4197         // TODO: IMPLEMENT
4198         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4199         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4200         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4201         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4202         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4203 }
4204
4205
4206
4207 void DPSOFTRAST_VertexShader_DeferredBounceLight(void)
4208 {
4209         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4210 }
4211
4212 void DPSOFTRAST_PixelShader_DeferredBounceLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4213 {
4214         // TODO: IMPLEMENT
4215         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4216         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4217         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4218         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4219         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4220 }
4221
4222
4223
4224 typedef struct DPSOFTRAST_ShaderModeInfo_s
4225 {
4226         int lodarrayindex;
4227         void (*Vertex)(void);
4228         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4229         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4230         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4231 }
4232 DPSOFTRAST_ShaderModeInfo;
4233
4234 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4235 {
4236         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4237         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4238         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4239         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4240         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4241         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4242         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4243         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4244         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4245         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4246         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4247         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4248         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4249         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4250         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4251         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4252         {2, DPSOFTRAST_VertexShader_DeferredBounceLight,        DPSOFTRAST_PixelShader_DeferredBounceLight,        {~0}}
4253 };
4254
4255 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4256 {
4257         int i;
4258         int x;
4259         int startx;
4260         int endx;
4261 //      unsigned int c;
4262 //      unsigned int *colorpixel;
4263         unsigned int *depthpixel;
4264         float w;
4265         float wslope;
4266         int depth;
4267         int depthslope;
4268         unsigned int d;
4269         DPSOFTRAST_State_Triangle *triangle;
4270         DPSOFTRAST_State_Span *span;
4271         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4272         for (i = 0; i < thread->numspans; i++)
4273         {
4274                 span = &thread->spans[i];
4275                 triangle = &thread->triangles[span->triangle];
4276                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4277                 {
4278                         wslope = triangle->w[0];
4279                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4280                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4281                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4282                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4283                         startx = span->startx;
4284                         endx = span->endx;
4285                         switch(thread->fb_depthfunc)
4286                         {
4287                         default:
4288                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4289                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4290                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4291                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4292                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4293                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4294                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4295                         }
4296                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4297                         //for (x = startx;x < endx;x++)
4298                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4299                         // if there is no color buffer, skip pixel shader
4300                         while (startx < endx && !pixelmask[startx])
4301                                 startx++;
4302                         while (endx > startx && !pixelmask[endx-1])
4303                                 endx--;
4304                         if (startx >= endx)
4305                                 continue; // no pixels to fill
4306                         span->pixelmask = pixelmask;
4307                         span->startx = startx;
4308                         span->endx = endx;
4309                         // run pixel shader if appropriate
4310                         // do this before running depthmask code, to allow the pixelshader
4311                         // to clear pixelmask values for alpha testing
4312                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4313                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4314                         if (thread->depthmask)
4315                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4316                                         if (pixelmask[x])
4317                                                 depthpixel[x] = d;
4318                 }
4319                 else
4320                 {
4321                         // no depth testing means we're just dealing with color...
4322                         // if there is no color buffer, skip pixel shader
4323                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4324                         {
4325                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4326                                 span->pixelmask = pixelmask;
4327                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4328                         }
4329                 }
4330         }
4331         thread->numspans = 0;
4332 }
4333
4334 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4335
4336 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4337 {
4338 #ifdef SSE2_PRESENT
4339         int cullface = thread->cullface;
4340         int minx, maxx, miny, maxy;
4341         int miny1, maxy1, miny2, maxy2;
4342         __m128i fbmin, fbmax;
4343         __m128 viewportcenter, viewportscale;
4344         int firstvertex = command->firstvertex;
4345         int numvertices = command->numvertices;
4346         int numtriangles = command->numtriangles;
4347         const int *element3i = command->element3i;
4348         const unsigned short *element3s = command->element3s;
4349         int clipped = command->clipped;
4350         int i;
4351         int j;
4352         int k;
4353         int y;
4354         int e[3];
4355         __m128i screeny;
4356         int starty, endy, bandy;
4357         int numpoints;
4358         int clipcase;
4359         float clipdist[4];
4360         __m128 triangleedge1, triangleedge2, trianglenormal;
4361         __m128 clipfrac[3];
4362         __m128 screen[4];
4363         DPSOFTRAST_State_Triangle *triangle;
4364         DPSOFTRAST_Texture *texture;
4365         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4366         miny = thread->fb_scissor[1];
4367         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4368         miny1 = bound(miny, thread->miny1, maxy);
4369         maxy1 = bound(miny, thread->maxy1, maxy);
4370         miny2 = bound(miny, thread->miny2, maxy);
4371         maxy2 = bound(miny, thread->maxy2, maxy);
4372         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4373         {
4374                 if (!ATOMIC_DECREMENT(command->refcount))
4375                 {
4376                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4377                                 MM_FREE(command->arrays);
4378                 }
4379                 return;
4380         }
4381         minx = thread->fb_scissor[0];
4382         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4383         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4384         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4385         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4386         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4387         screen[3] = _mm_setzero_ps();
4388         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4389         for (i = 0;i < numtriangles;i++)
4390         {
4391                 const float *screencoord4f = command->arrays;
4392                 const float *arrays = screencoord4f + numvertices*4;
4393
4394                 // generate the 3 edges of this triangle
4395                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4396                 if (element3s)
4397                 {
4398                         e[0] = element3s[i*3+0] - firstvertex;
4399                         e[1] = element3s[i*3+1] - firstvertex;
4400                         e[2] = element3s[i*3+2] - firstvertex;
4401                 }
4402                 else if (element3i)
4403                 {
4404                         e[0] = element3i[i*3+0] - firstvertex;
4405                         e[1] = element3i[i*3+1] - firstvertex;
4406                         e[2] = element3i[i*3+2] - firstvertex;
4407                 }
4408                 else
4409                 {
4410                         e[0] = i*3+0;
4411                         e[1] = i*3+1;
4412                         e[2] = i*3+2;
4413                 }
4414
4415 #define SKIPBACKFACE \
4416                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4417                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4418                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4419                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4420                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4421                 switch(cullface) \
4422                 { \
4423                 case GL_BACK: \
4424                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4425                                 continue; \
4426                         break; \
4427                 case GL_FRONT: \
4428                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4429                                 continue; \
4430                         break; \
4431                 }
4432
4433 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4434                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4435                         { \
4436                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4437                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4438                         }
4439 #define CLIPPEDVERTEXCOPY(k,p1) \
4440                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4441
4442 #define GENATTRIBCOPY(attrib, p1) \
4443                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4444 #define GENATTRIBLERP(attrib, p1, p2) \
4445                 { \
4446                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4447                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4448                 }
4449 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4450                 switch(clipcase) \
4451                 { \
4452                 default: \
4453                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4454                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4455                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4456                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4457                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4458                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4459                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4460                 }
4461
4462                 if (! clipped)
4463                         goto notclipped;
4464
4465                 // calculate distance from nearplane
4466                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4467                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4468                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4469                 if (clipdist[0] >= 0.0f)
4470                 {
4471                         if (clipdist[1] >= 0.0f)
4472                         {
4473                                 if (clipdist[2] >= 0.0f)
4474                                 {
4475                                 notclipped:
4476                                         // triangle is entirely in front of nearplane
4477                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4478                                         SKIPBACKFACE;
4479                                         numpoints = 3;
4480                                         clipcase = 0;
4481                                 }
4482                                 else
4483                                 {
4484                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4485                                         SKIPBACKFACE;
4486                                         numpoints = 4;
4487                                         clipcase = 1;
4488                                 }
4489                         }
4490                         else
4491                         {
4492                                 if (clipdist[2] >= 0.0f)
4493                                 {
4494                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4495                                         SKIPBACKFACE;
4496                                         numpoints = 4;
4497                                         clipcase = 2;
4498                                 }
4499                                 else
4500                                 {
4501                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4502                                         SKIPBACKFACE;
4503                                         numpoints = 3;
4504                                         clipcase = 3;
4505                                 }
4506                         }
4507                 }
4508                 else if (clipdist[1] >= 0.0f)
4509                 {
4510                         if (clipdist[2] >= 0.0f)
4511                         {
4512                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4513                                 SKIPBACKFACE;
4514                                 numpoints = 4;
4515                                 clipcase = 4;
4516                         }
4517                         else
4518                         {
4519                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4520                                 SKIPBACKFACE;
4521                                 numpoints = 3;
4522                                 clipcase = 5;
4523                         }
4524                 }
4525                 else if (clipdist[2] >= 0.0f)
4526                 {
4527                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4528                         SKIPBACKFACE;
4529                         numpoints = 3;
4530                         clipcase = 6;
4531                 }
4532                 else continue; // triangle is entirely behind nearplane
4533
4534                 {
4535                         // calculate integer y coords for triangle points
4536                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4537                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4538                                         screenmin = _mm_min_epi16(screeni, screenir),
4539                                         screenmax = _mm_max_epi16(screeni, screenir);
4540                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4541                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4542                         screenmin = _mm_max_epi16(screenmin, fbmin);
4543                         screenmax = _mm_min_epi16(screenmax, fbmax);
4544                         // skip offscreen triangles
4545                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4546                                 continue;
4547                         starty = _mm_extract_epi16(screenmin, 1);
4548                         endy = _mm_extract_epi16(screenmax, 1)+1;
4549                         if (starty >= maxy1 && endy <= miny2)
4550                                 continue;
4551                         screeny = _mm_srai_epi32(screeni, 16);
4552                 }
4553
4554                 triangle = &thread->triangles[thread->numtriangles];
4555
4556                 // calculate attribute plans for triangle data...
4557                 // okay, this triangle is going to produce spans, we'd better project
4558                 // the interpolants now (this is what gives perspective texturing),
4559                 // this consists of simply multiplying all arrays by the W coord
4560                 // (which is basically 1/Z), which will be undone per-pixel
4561                 // (multiplying by Z again) to get the perspective-correct array
4562                 // values
4563                 {
4564                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4565                         __m128 mipedgescale, mipdensity;
4566                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4567                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4568                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4569                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4570                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4571                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4572                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4573                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4574                         attribedge1 = _mm_sub_ss(w0, w1);
4575                         attribedge2 = _mm_sub_ss(w2, w1);
4576                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4577                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4578                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4579                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4580                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4581                         _mm_store_ss(&triangle->w[0], attribxslope);
4582                         _mm_store_ss(&triangle->w[1], attribyslope);
4583                         _mm_store_ss(&triangle->w[2], attriborigin);
4584                         mipedgescale = _mm_setzero_ps();
4585                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4586                         {
4587                                 __m128 attrib0, attrib1, attrib2;
4588                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4589                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4590                                         break;
4591                                 arrays += numvertices*4;
4592                                 GENATTRIBS(attrib0, attrib1, attrib2);
4593                                 attriborigin = _mm_mul_ps(attrib1, w1);
4594                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4595                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4596                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4597                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4598                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4599                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4600                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4601                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
4602                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4603                                 {
4604                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4605                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4606                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4607                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4608                                 }
4609                         }
4610
4611                         memset(triangle->mip, 0, sizeof(triangle->mip));
4612                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4613                         {
4614                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4615                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4616                                         break;
4617                                 texture = thread->texbound[texunit];
4618                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4619                                 {
4620                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4621                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4622                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4623                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4624                                         // this will be multiplied in the texturing routine by the texture resolution
4625                                         y = _mm_cvtss_si32(mipdensity);
4626                                         if (y > 0)
4627                                         {
4628                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4629                                                 if (y > texture->mipmaps - 1)
4630                                                         y = texture->mipmaps - 1;
4631                                                 triangle->mip[texunit] = y;
4632                                         }
4633                                 }
4634                         }
4635                 }
4636         
4637                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4638                 for (; y < bandy;)
4639                 {
4640                         __m128 xcoords, xslope;
4641                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4642                         int yccmask = _mm_movemask_epi8(ycc);
4643                         int edge0p, edge0n, edge1p, edge1n;
4644                         int nexty;
4645                         if (numpoints == 4)
4646                         {
4647                                 switch(yccmask)
4648                                 {
4649                                 default:
4650                                 case 0xFFFF: /*0000*/ y = endy; continue;
4651                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4652                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4653                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4654                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4655                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4656                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4657                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4658                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4659                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4660                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4661                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4662                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4663                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4664                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4665                                 case 0x0000: /*1111*/ y++; continue;
4666                                 }
4667                         }
4668                         else
4669                         {
4670                                 switch(yccmask)
4671                                 {
4672                                 default:
4673                                 case 0xFFFF: /*000*/ y = endy; continue;
4674                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4675                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4676                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4677                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4678                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4679                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4680                                 case 0x0000: /*111*/ y++; continue;
4681                                 }
4682                         }
4683                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4684                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4685                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4686                         nexty = _mm_extract_epi16(ycc, 0);
4687                         if (nexty >= bandy) nexty = bandy-1;
4688                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4689                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4690                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4691                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4692                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4693                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4694                         {
4695                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4696                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4697                         }
4698                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4699                         {
4700                                 int startx, endx, offset;
4701                                 startx = _mm_cvtss_si32(xcoords);
4702                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4703                                 if (startx < minx) 
4704                                 {
4705                                         if (startx < 0) startx = 0;
4706                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4707                                 }
4708                                 if (endx > maxx) endx = maxx;
4709                                 if (startx >= endx) continue;
4710                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4711                                 {
4712                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4713                                         span->triangle = thread->numtriangles;
4714                                         span->x = offset;
4715                                         span->y = y;
4716                                         span->startx = max(minx - offset, 0);
4717                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4718                                         if (span->startx >= span->endx)
4719                                                 continue; 
4720                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4721                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4722                                 }
4723                         }
4724                 }
4725
4726                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4727                 {
4728                         DPSOFTRAST_Draw_ProcessSpans(thread);
4729                         thread->numtriangles = 0;
4730                 }
4731         }
4732
4733         if (!ATOMIC_DECREMENT(command->refcount))
4734         {
4735                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4736                         MM_FREE(command->arrays);
4737         }
4738
4739         if (thread->numspans > 0 || thread->numtriangles > 0)
4740         {
4741                 DPSOFTRAST_Draw_ProcessSpans(thread);
4742                 thread->numtriangles = 0;
4743         }
4744 #endif
4745 }
4746
4747 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4748 {
4749         int i;
4750         int j;
4751         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4752         int datasize = 2*numvertices*sizeof(float[4]);
4753         DPSOFTRAST_Command_Draw *command;
4754         unsigned char *data;
4755         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4756         {
4757                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4758                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4759                         break;
4760                 datasize += numvertices*sizeof(float[4]);
4761         }
4762         if (element3s)
4763                 datasize += numtriangles*sizeof(unsigned short[3]);
4764         else if (element3i)
4765                 datasize += numtriangles*sizeof(int[3]);
4766         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4767         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4768         {
4769                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4770                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4771         }
4772         else
4773         {
4774                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4775                 data = (unsigned char *)command + commandsize;
4776         }
4777         command->firstvertex = firstvertex;
4778         command->numvertices = numvertices;
4779         command->numtriangles = numtriangles;
4780         command->arrays = (float *)data;
4781         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4782         dpsoftrast.firstvertex = firstvertex;
4783         dpsoftrast.numvertices = numvertices;
4784         dpsoftrast.screencoord4f = (float *)data;
4785         data += numvertices*sizeof(float[4]);
4786         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4787         data += numvertices*sizeof(float[4]);
4788         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4789         {
4790                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4791                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4792                         break;
4793                 dpsoftrast.post_array4f[j] = (float *)data;
4794                 data += numvertices*sizeof(float[4]);
4795         }
4796         command->element3i = NULL;
4797         command->element3s = NULL;
4798         if (element3s)
4799         {
4800                 command->element3s = (unsigned short *)data;
4801                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4802         }
4803         else if (element3i)
4804         {
4805                 command->element3i = (int *)data;
4806                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4807         }
4808         return command;
4809 }
4810
4811 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4812 {
4813         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4814         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4815         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4816         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4817         if (command->starty >= command->endy)
4818         {
4819                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4820                         MM_FREE(command->arrays);
4821                 DPSOFTRAST_UndoCommand(command->commandsize);
4822                 return;
4823         }
4824         command->clipped = dpsoftrast.drawclipped;
4825         command->refcount = dpsoftrast.numthreads;
4826
4827         if (dpsoftrast.usethreads)
4828         {
4829                 int i;
4830                 DPSOFTRAST_Draw_SyncCommands();
4831                 for (i = 0; i < dpsoftrast.numthreads; i++)
4832                 {
4833                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4834                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4835                                 Thread_CondSignal(thread->drawcond);
4836                 }
4837         }
4838         else
4839         {
4840                 DPSOFTRAST_Draw_FlushThreads();
4841         }
4842 }
4843  
4844 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4845 {
4846         int commandoffset = thread->commandoffset;
4847         while (commandoffset != endoffset)
4848         {
4849                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4850                 switch (command->opcode)
4851                 {
4852 #define INTERPCOMMAND(name) \
4853                 case DPSOFTRAST_OPCODE_##name : \
4854                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4855                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4856                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4857                                 commandoffset = 0; \
4858                         break;
4859                 INTERPCOMMAND(Viewport)
4860                 INTERPCOMMAND(ClearColor)
4861                 INTERPCOMMAND(ClearDepth)
4862                 INTERPCOMMAND(ColorMask)
4863                 INTERPCOMMAND(DepthTest)
4864                 INTERPCOMMAND(ScissorTest)
4865                 INTERPCOMMAND(Scissor)
4866                 INTERPCOMMAND(BlendFunc)
4867                 INTERPCOMMAND(BlendSubtract)
4868                 INTERPCOMMAND(DepthMask)
4869                 INTERPCOMMAND(DepthFunc)
4870                 INTERPCOMMAND(DepthRange)
4871                 INTERPCOMMAND(PolygonOffset)
4872                 INTERPCOMMAND(CullFace)
4873                 INTERPCOMMAND(AlphaTest)
4874                 INTERPCOMMAND(AlphaFunc)
4875                 INTERPCOMMAND(SetTexture)
4876                 INTERPCOMMAND(SetShader)
4877                 INTERPCOMMAND(Uniform4f)
4878                 INTERPCOMMAND(UniformMatrix4f)
4879                 INTERPCOMMAND(Uniform1i)
4880
4881                 case DPSOFTRAST_OPCODE_Draw:
4882                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4883                         commandoffset += command->commandsize;
4884                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4885                                 commandoffset = 0;
4886                         thread->commandoffset = commandoffset;
4887                         break;
4888
4889                 case DPSOFTRAST_OPCODE_Reset:
4890                         commandoffset = 0;
4891                         break;
4892                 }
4893         }
4894         thread->commandoffset = commandoffset;
4895 }
4896
4897 static int DPSOFTRAST_Draw_Thread(void *data)
4898 {
4899         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4900         while(thread->index >= 0)
4901         {
4902                 if (thread->commandoffset != dpsoftrast.drawcommand)
4903                 {
4904                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
4905                 }
4906                 else 
4907                 {
4908                         Thread_LockMutex(thread->drawmutex);
4909                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
4910                         {
4911                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
4912                                 thread->starving = true;
4913                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
4914                                 thread->starving = false;
4915                         }
4916                         Thread_UnlockMutex(thread->drawmutex);
4917                 }
4918         }   
4919         return 0;
4920 }
4921
4922 static void DPSOFTRAST_Draw_FlushThreads(void)
4923 {
4924         DPSOFTRAST_State_Thread *thread;
4925         int i;
4926         DPSOFTRAST_Draw_SyncCommands();
4927         if (dpsoftrast.usethreads) 
4928         {
4929                 for (i = 0; i < dpsoftrast.numthreads; i++)
4930                 {
4931                         thread = &dpsoftrast.threads[i];
4932                         if (thread->commandoffset != dpsoftrast.drawcommand)
4933                         {
4934                                 Thread_LockMutex(thread->drawmutex);
4935                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4936                                         Thread_CondSignal(thread->drawcond);
4937                                 Thread_UnlockMutex(thread->drawmutex);
4938                         }
4939                 }
4940                 for (i = 0; i < dpsoftrast.numthreads; i++)
4941                 {
4942                         thread = &dpsoftrast.threads[i];
4943                         if (thread->commandoffset != dpsoftrast.drawcommand)
4944                         {
4945                                 Thread_LockMutex(thread->drawmutex);
4946                                 if (thread->commandoffset != dpsoftrast.drawcommand)
4947                                 {
4948                                         thread->waiting = true;
4949                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
4950                                         thread->waiting = false;
4951                                 }
4952                                 Thread_UnlockMutex(thread->drawmutex);
4953                         }
4954                 }
4955         }
4956         else
4957         {
4958                 for (i = 0; i < dpsoftrast.numthreads; i++)
4959                 {
4960                         thread = &dpsoftrast.threads[i];
4961                         if (thread->commandoffset != dpsoftrast.drawcommand)
4962                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4963                 }
4964         }
4965         dpsoftrast.commandpool.usedcommands = 0;
4966 }
4967
// Public entry point: executes all queued commands by flushing the draw threads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
4972
// Public entry point: currently identical to DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
4977
4978 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4979 {
4980         int i;
4981         union
4982         {
4983                 int i;
4984                 unsigned char b[4];
4985         }
4986         u;
4987         u.i = 1;
4988         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4989         dpsoftrast.bigendian = u.b[3];
4990         dpsoftrast.fb_width = width;
4991         dpsoftrast.fb_height = height;
4992         dpsoftrast.fb_depthpixels = depthpixels;
4993         dpsoftrast.fb_colorpixels[0] = colorpixels;
4994         dpsoftrast.fb_colorpixels[1] = NULL;
4995         dpsoftrast.fb_colorpixels[1] = NULL;
4996         dpsoftrast.fb_colorpixels[1] = NULL;
4997         dpsoftrast.viewport[0] = 0;
4998         dpsoftrast.viewport[1] = 0;
4999         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5000         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5001         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5002         dpsoftrast.texture_firstfree = 1;
5003         dpsoftrast.texture_end = 1;
5004         dpsoftrast.texture_max = 0;
5005         dpsoftrast.color[0] = 1;
5006         dpsoftrast.color[1] = 1;
5007         dpsoftrast.color[2] = 1;
5008         dpsoftrast.color[3] = 1;
5009         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5010         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5011         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5012         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5013         for (i = 0; i < dpsoftrast.numthreads; i++)
5014         {
5015                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5016                 thread->index = i;
5017                 thread->cullface = GL_BACK;
5018                 thread->colormask[1] = 1;
5019                 thread->colormask[2] = 1;
5020                 thread->colormask[3] = 1;
5021                 thread->blendfunc[0] = GL_ONE;
5022                 thread->blendfunc[1] = GL_ZERO;
5023                 thread->depthmask = true;
5024                 thread->depthtest = true;
5025                 thread->depthfunc = GL_LEQUAL;
5026                 thread->scissortest = false;
5027                 thread->alphatest = false;
5028                 thread->alphafunc = GL_GREATER;
5029                 thread->alphavalue = 0.5f;
5030                 thread->viewport[0] = 0;
5031                 thread->viewport[1] = 0;
5032                 thread->viewport[2] = dpsoftrast.fb_width;
5033                 thread->viewport[3] = dpsoftrast.fb_height;
5034                 thread->scissor[0] = 0;
5035                 thread->scissor[1] = 0;
5036                 thread->scissor[2] = dpsoftrast.fb_width;
5037                 thread->scissor[3] = dpsoftrast.fb_height;
5038                 thread->depthrange[0] = 0;
5039                 thread->depthrange[1] = 1;
5040                 thread->polygonoffset[0] = 0;
5041                 thread->polygonoffset[1] = 0;
5042         
5043                 if (dpsoftrast.interlace)
5044                 {
5045                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5046                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5047                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5048                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5049                 }
5050                 else
5051                 {
5052                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5053                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5054                 }
5055
5056                 thread->numspans = 0;
5057                 thread->numtriangles = 0;
5058                 thread->commandoffset = 0;
5059                 thread->waiting = false;
5060                 thread->starving = false;
5061            
5062                 thread->validate = -1;
5063                 DPSOFTRAST_Validate(thread, -1);
5064  
5065                 if (dpsoftrast.usethreads)
5066                 {
5067                         thread->waitcond = Thread_CreateCond();
5068                         thread->drawcond = Thread_CreateCond();
5069                         thread->drawmutex = Thread_CreateMutex();
5070                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5071                 }
5072         }
5073         return 0;
5074 }
5075
5076 void DPSOFTRAST_Shutdown(void)
5077 {
5078         int i;
5079         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5080         {
5081                 DPSOFTRAST_State_Thread *thread;
5082                 for (i = 0; i < dpsoftrast.numthreads; i++)
5083                 {
5084                         thread = &dpsoftrast.threads[i];
5085                         Thread_LockMutex(thread->drawmutex);
5086                         thread->index = -1;
5087                         Thread_CondSignal(thread->drawcond);
5088                         Thread_UnlockMutex(thread->drawmutex);
5089                         Thread_WaitThread(thread->thread, 0);
5090                         Thread_DestroyCond(thread->waitcond);
5091                         Thread_DestroyCond(thread->drawcond);
5092                         Thread_DestroyMutex(thread->drawmutex);
5093                 }
5094         }
5095         for (i = 0;i < dpsoftrast.texture_end;i++)
5096                 if (dpsoftrast.texture[i].bytes)
5097                         MM_FREE(dpsoftrast.texture[i].bytes);
5098         if (dpsoftrast.texture)
5099                 free(dpsoftrast.texture);
5100         if (dpsoftrast.threads)
5101                 MM_FREE(dpsoftrast.threads);
5102         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5103 }
5104