]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
dpsoftrast: speed up deluxemapping a bit; support tangentspace deluxemapping too...
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// Byte alignments used in this file: ALIGN_SIZE is reused as COMMAND_SIZE,
// ATOMIC_SIZE is the alignment requested from _mm_malloc by MM_MALLOC/MM_CALLOC.
13 #define ALIGN_SIZE 16
14 #define ATOMIC_SIZE 32
15
// With SSE2 available, pick the compiler/platform specific spelling of:
//   ALIGN(var)  / ATOMIC(var)   - declare var with 16 / 32 byte alignment
//   MEMORY_BARRIER              - _mm_sfence()
//   ATOMIC_COUNTER              - integer type used with the atomic operations
//   ATOMIC_INCREMENT/DECREMENT/ADD - atomic updates of such a counter
16 #ifdef SSE2_PRESENT
17         #if defined(__APPLE__)
18                 #include <libkern/OSAtomic.h>
19                 #define ALIGN(var) var __attribute__((__aligned__(16)))
20                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21                 #define MEMORY_BARRIER (_mm_sfence())
22                 #define ATOMIC_COUNTER volatile int32_t 
23                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
26         #elif defined(__GNUC__)
27                 #define ALIGN(var) var __attribute__((__aligned__(16)))
28                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29                 #define MEMORY_BARRIER (_mm_sfence())
30                 //(__sync_synchronize())
31                 #define ATOMIC_COUNTER volatile int
32                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
35         #elif defined(_MSC_VER)
36                 #define ALIGN(var) __declspec(align(16)) var
37                 #define ATOMIC(var) __declspec(align(32)) var
38                 #define MEMORY_BARRIER (_mm_sfence())
39                 //(MemoryBarrier())
40                 #define ATOMIC_COUNTER volatile LONG
41                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
44         #endif
45 #endif
46
// Fallbacks for anything the section above left undefined: no alignment
// request, no barrier, and plain (non-atomic) integer arithmetic.
47 #ifndef ALIGN
48 #define ALIGN(var) var
49 #endif
50 #ifndef ATOMIC
51 #define ATOMIC(var) var
52 #endif
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
55 #endif
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
58 #endif
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
61 #endif
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
64 #endif
65 #ifndef ATOMIC_ADD
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
67 #endif
68
#ifdef SSE2_PRESENT
#include <emmintrin.h>

// allocate size bytes aligned to ATOMIC_SIZE; release with MM_FREE
#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned stand-in for calloc(): returns nmemb*size zeroed bytes aligned to
// ATOMIC_SIZE, or NULL when the allocation fails or the byte count would not
// fit in a size_t.  Release the result with MM_FREE.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // calloc() rejects requests whose product wraps around; do the same
        // instead of silently allocating a too-small buffer
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
// no SSE2: use the C library allocator directly
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
87
// Slot numbers used as the first index of attribs[] and post_array4f[].
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION = 0,
        DPSOFTRAST_ARRAY_COLOR = 1,
        DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
        DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
        DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
        DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
        DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
        DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
        DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
        DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
        DPSOFTRAST_ARRAY_TOTAL = 10 // number of slots, not a slot itself
} DPSOFTRAST_ARRAY;
103
104 typedef struct DPSOFTRAST_Texture_s
105 {
106         int flags;
107         int width;
108         int height;
109         int depth;
110         int sides;
111         DPSOFTRAST_TEXTURE_FILTER filter;
112         int mipmaps;
113         int size;
114         ATOMIC_COUNTER binds;
115         unsigned char *bytes;
116         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
117 }
118 DPSOFTRAST_Texture;
119
// Commands share the 16 byte alignment of ALIGN().
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
122
// Header layout common to all commands: every struct generated by DEFCOMMAND
// below starts with these same two fields.
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
124 {
125         unsigned char opcode;
126         unsigned short commandsize;
127 }
128 DPSOFTRAST_Command);
129
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
131
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and the aligned struct type
// DPSOFTRAST_Command_<name>, made of the common header followed by `fields`.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
135         { \
136                 unsigned char opcode; \
137                 unsigned short commandsize; \
138                 fields \
139         } DPSOFTRAST_Command_##name );
140
// Size in bytes of the commands[] buffer of the pool below.
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command's size - confirm at the use sites.
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
143
// Fixed-size byte buffer holding commands plus two bookkeeping integers.
// NOTE(review): freecommand/usedcommands look like a write offset and a fill
// count for commands[] - confirm against the code that allocates commands.
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
145 {
146         int freecommand;
147         int usedcommands;
148         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
149 }
150 DPSOFTRAST_State_Command_Pool);
151
// Per-triangle interpolation data read by the DPSOFTRAST_CALCATTRIB macros.
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
153 {
154         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
155         float w[3];
// attribs[array][0] is multiplied by the span x, attribs[array][1] by the
// span y, and attribs[array][2] is the constant term (see DPSOFTRAST_CALCATTRIB)
156         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
157 }
158 DPSOFTRAST_State_Triangle);
159
// Evaluate attribute array `arrayindex` of `triangle` at the position of `span`:
//   slope = attribs[arrayindex][0]                       (change per pixel in x)
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// SSE version: `data` and `slope` are __m128 lvalues, all four components at once.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of the above: `data` and `slope` are float[4].
// Every pointer/array argument is parenthesized so that expressions such as
// &localspan can be passed, matching what the SSE version already accepts.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
176                                         
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
178
// One horizontal run of pixels of a triangle; x and y are the values fed to
// the DPSOFTRAST_CALCATTRIB macros as the interpolation position.
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
180 {
181         int triangle; // triangle this span was generated by
182         int x; // framebuffer x coord
183         int y; // framebuffer y coord
184         int startx; // usable range (according to pixelmask)
185         int endx; // usable range (according to pixelmask)
186         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
187 }
188 DPSOFTRAST_State_Span);
189
// Capacities of the per-thread spans[] and triangles[] arrays.
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
192
// Bits of the per-thread `validate` field.  A set bit means the matching
// derived values are stale; DPSOFTRAST_Validate clears the bit and calls
// RecalcFB / RecalcDepthFunc / RecalcBlendFunc respectively.
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
197
// Blend modes stored in fb_blendmode; DPSOFTRAST_RecalcBlendFunc picks one
// from the (source factor, destination factor) pair and the subtract flag.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE = 0,      // GL_ONE, GL_ZERO (also the fallback for unlisted pairs)
        DPSOFTRAST_BLENDMODE_ALPHA = 1,       // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_ADDALPHA = 2,    // GL_SRC_ALPHA, GL_ONE
        DPSOFTRAST_BLENDMODE_ADD = 3,         // GL_ONE, GL_ONE
        DPSOFTRAST_BLENDMODE_INVMOD = 4,      // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
        DPSOFTRAST_BLENDMODE_MUL = 5,         // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
        DPSOFTRAST_BLENDMODE_MUL2 = 6,        // GL_DST_COLOR, GL_SRC_COLOR
        DPSOFTRAST_BLENDMODE_SUBALPHA = 7,    // GL_SRC_ALPHA, GL_ONE with blendsubtract set
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_INVADD = 9,      // GL_ONE_MINUS_DST_COLOR, GL_ONE
        DPSOFTRAST_BLENDMODE_TOTAL = 10       // number of modes
} DPSOFTRAST_BLENDMODE;
213
// State kept separately for every entry of dpsoftrast.threads[]: a copy of
// the render state, values derived from it (refreshed by DPSOFTRAST_Validate),
// and that thread's span/triangle buffers.
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
215 {
216         void *thread;
217         int index;
218         
219         int cullface;
220         int colormask[4];
221         int blendfunc[2]; // [0] source factor, [1] destination factor (GL_* values)
222         int blendsubtract;
223         int depthmask;
224         int depthtest;
225         int depthfunc;
226         int scissortest;
227         int alphatest;
228         int alphafunc;
229         float alphavalue;
230         int viewport[4]; // x, y, width, height as read by DPSOFTRAST_RecalcViewport
231         int scissor[4]; // x, y, width, height as read by DPSOFTRAST_RecalcFB
232         float depthrange[2];
233         float polygonoffset[2];
234
235         int shader_mode;
236         int shader_permutation;
237
238         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into dpsoftrast.texture[], rebased when that table grows
239         
240         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
241         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
242
243         // DPSOFTRAST_VALIDATE_ flags
244         int validate;
245
246         // derived values (DPSOFTRAST_VALIDATE_FB)
247         int fb_colormask;
248         int fb_scissor[4]; // x, y, width, height after clamping to the framebuffer
249         ALIGN(float fb_viewportcenter[4]);
250         ALIGN(float fb_viewportscale[4]);
251
252         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
253         int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off
254
255         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
256         int fb_blendmode; // a DPSOFTRAST_BLENDMODE value
257
258         // band boundaries
259         int miny1;
260         int maxy1;
261         int miny2;
262         int maxy2;
263
264         ATOMIC(volatile int commandoffset);
265
266         volatile bool waiting;
267         volatile bool starving;
268         void *waitcond;
269         void *drawcond;
270         void *drawmutex;
271
272         int numspans;
273         int numtriangles;
274         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
275         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
276 }
277 DPSOFTRAST_State_Thread);
278
// Global rasterizer state: render targets, vertex array pointers, the texture
// table, the per-thread states and the command pool.
279 typedef ATOMIC(struct DPSOFTRAST_State_s
280 {
281         int fb_width;
282         int fb_height;
283         unsigned int *fb_depthpixels;
284         unsigned int *fb_colorpixels[4];
285
286         int viewport[4];
287         ALIGN(float fb_viewportcenter[4]);
288         ALIGN(float fb_viewportscale[4]);
289
290         float color[4];
291         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
292         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
293
294         const float *pointer_vertex3f;
295         const float *pointer_color4f;
296         const unsigned char *pointer_color4ub;
297         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
298         int stride_vertex;
299         int stride_color;
300         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
301         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into texture[], rebased when the table grows
303
304         int firstvertex;
305         int numvertices;
306         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
307         float *screencoord4f;
308         int drawstarty;
309         int drawendy;
310         int drawclipped;
311         
312         int shader_mode;
313         int shader_permutation;
314
315         int texture_max; // number of slots allocated in texture[]
316         int texture_end; // slots at or beyond this index are never looked up
317         int texture_firstfree; // where DPSOFTRAST_Texture_New starts searching for a free slot
318         DPSOFTRAST_Texture *texture;
319
320         int bigendian;
321
322         // error reporting
323         const char *errorstring;
324
325         bool usethreads;
326         int interlace;
327         int numthreads;
328         DPSOFTRAST_State_Thread *threads;
329
330         ATOMIC(volatile int drawcommand);
331
332         DPSOFTRAST_State_Command_Pool commandpool;
333 }
334 DPSOFTRAST_State);
335
// the single global instance used throughout this file
336 DPSOFTRAST_State dpsoftrast;
337
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float components into one 32bit value: a in bits 24-31, r in
// bits 16-23, g in bits 8-15, b in bits 0-7, each scaled by 255 and rounded.
// Arguments are parenthesized so expressions (e.g. x + y) convert correctly,
// and the shifts are done on unsigned values because shifting 255 into the
// sign bit of an int is undefined; the result is still an int as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
        ((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
        ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
        ((unsigned int)(int)((b) * 255.0f + 0.5f)) | \
        ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Convert a float depth d to the integer value DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
343
344 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
345 {
346         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
347         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
348         fb_viewportcenter[3] = 0.5f;
349         fb_viewportcenter[0] = 0.0f;
350         fb_viewportscale[1] = 0.5f * viewport[2];
351         fb_viewportscale[2] = -0.5f * viewport[3];
352         fb_viewportscale[3] = 0.5f;
353         fb_viewportscale[0] = 1.0f;
354 }
355
356 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
357 {
358         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
359         // and viewport projection values
360         int x1, x2;
361         int y1, y2;
362         x1 = thread->scissor[0];
363         x2 = thread->scissor[0] + thread->scissor[2];
364         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
365         y2 = dpsoftrast.fb_height - thread->scissor[1];
366         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
367         if (x1 < 0) x1 = 0;
368         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
369         if (y1 < 0) y1 = 0;
370         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
371         thread->fb_scissor[0] = x1;
372         thread->fb_scissor[1] = y1;
373         thread->fb_scissor[2] = x2 - x1;
374         thread->fb_scissor[3] = y2 - y1;
375
376         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
377 }
378
379 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
380 {
381         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
382 }
383
384 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
385 {
386         if (thread->blendsubtract)
387         {
388                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
389                 {
390                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
391                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
393                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
394                 }
395         }
396         else
397         {       
398                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
399                 {
400                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
401                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
402                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
403                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
404                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
405                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
406                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
407                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
408                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
409                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
410                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
411                 }
412         }
413 }
414
// Call DPSOFTRAST_Validate only when one of the requested flags is pending;
// always evaluates to 0.  `thread` is parenthesized so any pointer expression
// can be passed (note that it is evaluated twice when a flag is pending).
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
416
417 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
418 {
419         mask &= thread->validate;
420         if (!mask)
421                 return;
422         if (mask & DPSOFTRAST_VALIDATE_FB)
423         {
424                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
425                 DPSOFTRAST_RecalcFB(thread);
426         }
427         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
428         {
429                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
430                 DPSOFTRAST_RecalcDepthFunc(thread);
431         }
432         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
433         {
434                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
435                 DPSOFTRAST_RecalcBlendFunc(thread);
436         }
437 }
438
439 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
440 {
441         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
442                 return &dpsoftrast.texture[index];
443         return NULL;
444 }
445
446 static void DPSOFTRAST_Texture_Grow(void)
447 {
448         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
449         DPSOFTRAST_State_Thread *thread;
450         int i;
451         int j;
452         DPSOFTRAST_Flush();
453         // expand texture array as needed
454         if (dpsoftrast.texture_max < 1024)
455                 dpsoftrast.texture_max = 1024;
456         else
457                 dpsoftrast.texture_max *= 2;
458         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
459         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
460                 if (dpsoftrast.texbound[i])
461                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
462         for (j = 0; j < dpsoftrast.numthreads; j++)
463         {
464                 thread = &dpsoftrast.threads[j];
465                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
466                         if (thread->texbound[i])
467                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
468         }
469 }
470
471 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
472 {
473         int w;
474         int h;
475         int d;
476         int size;
477         int s;
478         int texnum;
479         int mipmaps;
480         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
481         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
482         DPSOFTRAST_Texture *texture;
483         if (width*height*depth < 1)
484         {
485                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
486                 return 0;
487         }
488         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
489         {
490                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
491                 return 0;
492         }
493         switch(texformat)
494         {
495         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
496         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
497         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
498                 break;
499         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
500                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
501                 {
502                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
503                         return 0;
504                 }
505                 if (depth != 1)
506                 {
507                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
508                         return 0;
509                 }
510                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
511                 {
512                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
513                         return 0;
514                 }
515                 break;
516         }
517         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
518         {
519                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
520                 return 0;
521         }
522         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
523         {
524                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
525                 return 0;
526         }
527         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
528         {
529                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
530                 return 0;
531         }
532         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
533         {
534                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
535                 return 0;
536         }
537         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
538         {
539                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
540                 return 0;
541         }
542         // find first empty slot in texture array
543         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
544                 if (!dpsoftrast.texture[texnum].bytes)
545                         break;
546         dpsoftrast.texture_firstfree = texnum + 1;
547         if (dpsoftrast.texture_max <= texnum)
548                 DPSOFTRAST_Texture_Grow();
549         if (dpsoftrast.texture_end <= texnum)
550                 dpsoftrast.texture_end = texnum + 1;
551         texture = &dpsoftrast.texture[texnum];
552         memset(texture, 0, sizeof(*texture));
553         texture->flags = flags;
554         texture->width = width;
555         texture->height = height;
556         texture->depth = depth;
557         texture->sides = sides;
558         texture->binds = 0;
559         w = width;
560         h = height;
561         d = depth;
562         size = 0;
563         mipmaps = 0;
564         w = width;
565         h = height;
566         d = depth;
567         for (;;)
568         {
569                 s = w * h * d * sides * 4;
570                 texture->mipmap[mipmaps][0] = size;
571                 texture->mipmap[mipmaps][1] = s;
572                 texture->mipmap[mipmaps][2] = w;
573                 texture->mipmap[mipmaps][3] = h;
574                 texture->mipmap[mipmaps][4] = d;
575                 size += s;
576                 mipmaps++;
577                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
578                         break;
579                 if (w > 1) w >>= 1;
580                 if (h > 1) h >>= 1;
581                 if (d > 1) d >>= 1;
582         }
583         texture->mipmaps = mipmaps;
584         texture->size = size;
585
586         // allocate the pixels now
587         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
588
589         return texnum;
590 }
591 void DPSOFTRAST_Texture_Free(int index)
592 {
593         DPSOFTRAST_Texture *texture;
594         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
595         if (texture->binds)
596                 DPSOFTRAST_Flush();
597         if (texture->bytes)
598                 MM_FREE(texture->bytes);
599         texture->bytes = NULL;
600         memset(texture, 0, sizeof(*texture));
601         // adjust the free range and used range
602         if (dpsoftrast.texture_firstfree > index)
603                 dpsoftrast.texture_firstfree = index;
604         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
605                 dpsoftrast.texture_end--;
606 }
607 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
608 {
609         int i, x, y, z, w, layer0, layer1, row0, row1;
610         unsigned char *o, *i0, *i1, *i2, *i3;
611         DPSOFTRAST_Texture *texture;
612         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
613         if (texture->mipmaps <= 1)
614                 return;
615         for (i = 1;i < texture->mipmaps;i++)
616         {
617                 for (z = 0;z < texture->mipmap[i][4];z++)
618                 {
619                         layer0 = z*2;
620                         layer1 = z*2+1;
621                         if (layer1 >= texture->mipmap[i-1][4])
622                                 layer1 = texture->mipmap[i-1][4]-1;
623                         for (y = 0;y < texture->mipmap[i][3];y++)
624                         {
625                                 row0 = y*2;
626                                 row1 = y*2+1;
627                                 if (row1 >= texture->mipmap[i-1][3])
628                                         row1 = texture->mipmap[i-1][3]-1;
629                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
630                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
631                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
632                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
633                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
634                                 w = texture->mipmap[i][2];
635                                 if (layer1 > layer0)
636                                 {
637                                         if (texture->mipmap[i-1][2] > 1)
638                                         {
639                                                 // average 3D texture
640                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
641                                                 {
642                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
643                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
644                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
645                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
646                                                 }
647                                         }
648                                         else
649                                         {
650                                                 // average 3D mipmap with parent width == 1
651                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
652                                                 {
653                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
654                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
655                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
656                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
657                                                 }
658                                         }
659                                 }
660                                 else
661                                 {
662                                         if (texture->mipmap[i-1][2] > 1)
663                                         {
664                                                 // average 2D texture (common case)
665                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
666                                                 {
667                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
668                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
669                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
670                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
671                                                 }
672                                         }
673                                         else
674                                         {
675                                                 // 2D texture with parent width == 1
676                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
677                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
678                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
679                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
680                                         }
681                                 }
682                         }
683                 }
684         }
685 }
686 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
687 {
688         DPSOFTRAST_Texture *texture;
689         unsigned char *dst;
690         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
691         if (texture->binds)
692                 DPSOFTRAST_Flush();
693         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
694         while (blockheight > 0)
695         {
696                 memcpy(dst, pixels, blockwidth * 4);
697                 pixels += blockwidth * 4;
698                 dst += texture->mipmap[0][2] * 4;
699                 blockheight--;
700         }
701         DPSOFTRAST_Texture_CalculateMipmaps(index);
702 }
703 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
704 {
705         DPSOFTRAST_Texture *texture;
706         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
707         if (texture->binds)
708                 DPSOFTRAST_Flush();
709         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
710         DPSOFTRAST_Texture_CalculateMipmaps(index);
711 }
712 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
713 {
714         DPSOFTRAST_Texture *texture;
715         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
716         return texture->mipmap[mip][2];
717 }
718 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
719 {
720         DPSOFTRAST_Texture *texture;
721         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722         return texture->mipmap[mip][3];
723 }
724 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
725 {
726         DPSOFTRAST_Texture *texture;
727         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728         return texture->mipmap[mip][4];
729 }
730 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
731 {
732         DPSOFTRAST_Texture *texture;
733         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
734         if (texture->binds)
735                 DPSOFTRAST_Flush();
736         return texture->bytes + texture->mipmap[mip][0];
737 }
738 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
739 {
740         DPSOFTRAST_Texture *texture;
741         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
742         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
743         {
744                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
745                 return;
746         }
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         texture->filter = filter;
750 }
751
752 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
753 {
754         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
755                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
756                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
757                 DPSOFTRAST_Flush();
758         dpsoftrast.fb_width = width;
759         dpsoftrast.fb_height = height;
760         dpsoftrast.fb_depthpixels = depthpixels;
761         dpsoftrast.fb_colorpixels[0] = colorpixels0;
762         dpsoftrast.fb_colorpixels[1] = colorpixels1;
763         dpsoftrast.fb_colorpixels[2] = colorpixels2;
764         dpsoftrast.fb_colorpixels[3] = colorpixels3;
765 }
766
767 static void DPSOFTRAST_Draw_FlushThreads(void);
768
769 static void DPSOFTRAST_Draw_SyncCommands(void)
770 {
771         if(dpsoftrast.usethreads) MEMORY_BARRIER;
772         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
773 }
774
775 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
776 {
777         DPSOFTRAST_State_Thread *thread;
778         int i;
779         int freecommand = dpsoftrast.commandpool.freecommand;
780         int usedcommands = dpsoftrast.commandpool.usedcommands;
781         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
782                 return;
783         DPSOFTRAST_Draw_SyncCommands();
784         for(;;)
785         {
786                 int waitindex = -1;
787                 int commandoffset;
788                 usedcommands = 0;
789                 for (i = 0; i < dpsoftrast.numthreads; i++)
790                 {
791                         thread = &dpsoftrast.threads[i]; 
792                         commandoffset = freecommand - thread->commandoffset;
793                         if (commandoffset < 0)
794                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
795                         if (commandoffset > usedcommands)
796                         {
797                                 waitindex = i;
798                                 usedcommands = commandoffset;
799                         }
800                 }
801                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
802                         break;
803                 thread = &dpsoftrast.threads[waitindex];
804                 Thread_LockMutex(thread->drawmutex);
805                 if (thread->commandoffset != dpsoftrast.drawcommand)
806                 {
807                         thread->waiting = true;
808                         if (thread->starving) Thread_CondSignal(thread->drawcond);
809                         Thread_CondWait(thread->waitcond, thread->drawmutex);
810                         thread->waiting = false;
811                 }
812                 Thread_UnlockMutex(thread->drawmutex);
813         }
814         dpsoftrast.commandpool.usedcommands = usedcommands;
815 }
816
// Pads `size` up to a multiple of COMMAND_SIZE (a value that is already a
// multiple is left unchanged). The mask arithmetic presumably relies on
// COMMAND_SIZE being a power of two - TODO confirm at its definition.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
        ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> through DPSOFTRAST_AllocateCommand, using the
// padded struct size, and casts the result to the command's struct type.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
        ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
821
822 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
823 {
824         DPSOFTRAST_Command *command;
825         int freecommand = dpsoftrast.commandpool.freecommand;
826         int usedcommands = dpsoftrast.commandpool.usedcommands;
827         int extra = sizeof(DPSOFTRAST_Command);
828         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
829                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
830         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
831         {
832                 if (dpsoftrast.usethreads)
833                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
834                 else
835                         DPSOFTRAST_Draw_FlushThreads();
836                 freecommand = dpsoftrast.commandpool.freecommand;
837                 usedcommands = dpsoftrast.commandpool.usedcommands;
838         }
839         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
840         {
841                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
842                 command->opcode = DPSOFTRAST_OPCODE_Reset;
843                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
844                 freecommand = 0;
845         }
846         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
847         command->opcode = opcode;
848         command->commandsize = size;
849         freecommand += size;
850         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
851                 freecommand = 0;
852         dpsoftrast.commandpool.freecommand = freecommand;
853         dpsoftrast.commandpool.usedcommands = usedcommands + size;
854         return command;
855 }
856
857 static void DPSOFTRAST_UndoCommand(int size)
858 {
859         int freecommand = dpsoftrast.commandpool.freecommand;
860         int usedcommands = dpsoftrast.commandpool.usedcommands;
861         freecommand -= size;
862         if (freecommand < 0)
863                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
864         usedcommands -= size;
865         dpsoftrast.commandpool.freecommand = freecommand;
866         dpsoftrast.commandpool.usedcommands = usedcommands;
867 }
868                 
869 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
870 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
871 {
872         thread->viewport[0] = command->x;
873         thread->viewport[1] = command->y;
874         thread->viewport[2] = command->width;
875         thread->viewport[3] = command->height;
876         thread->validate |= DPSOFTRAST_VALIDATE_FB;
877 }
878 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
879 {
880         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
881         command->x = x;
882         command->y = y;
883         command->width = width;
884         command->height = height;
885
886         dpsoftrast.viewport[0] = x;
887         dpsoftrast.viewport[1] = y;
888         dpsoftrast.viewport[2] = width;
889         dpsoftrast.viewport[3] = height;
890         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
891 }
892
893 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
894 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
895 {
896         int i, x1, y1, x2, y2, w, h, x, y;
897         int miny1 = thread->miny1;
898         int maxy1 = thread->maxy1;
899         int miny2 = thread->miny2;
900         int maxy2 = thread->maxy2;
901         int bandy;
902         unsigned int *p;
903         unsigned int c;
904         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
905         x1 = thread->fb_scissor[0];
906         y1 = thread->fb_scissor[1];
907         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
908         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
909         if (y1 < miny1) y1 = miny1;
910         if (y2 > maxy2) y2 = maxy2;
911         w = x2 - x1;
912         h = y2 - y1;
913         if (w < 1 || h < 1)
914                 return;
915         // FIXME: honor fb_colormask?
916         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
917         for (i = 0;i < 4;i++)
918         {
919                 if (!dpsoftrast.fb_colorpixels[i])
920                         continue;
921                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
922                 for (;y < bandy;y++)
923                 {
924                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
925                         for (x = x1;x < x2;x++)
926                                 p[x] = c;
927                 }
928         }
929 }
930 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
931 {
932         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
933         command->r = r;
934         command->g = g;
935         command->b = b;
936         command->a = a;
937 }
938
939 DEFCOMMAND(3, ClearDepth, float depth;)
940 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
941 {
942         int x1, y1, x2, y2, w, h, x, y;
943         int miny1 = thread->miny1;
944         int maxy1 = thread->maxy1;
945         int miny2 = thread->miny2;
946         int maxy2 = thread->maxy2;
947         int bandy;
948         unsigned int *p;
949         unsigned int c;
950         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
951         x1 = thread->fb_scissor[0];
952         y1 = thread->fb_scissor[1];
953         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
955         if (y1 < miny1) y1 = miny1;
956         if (y2 > maxy2) y2 = maxy2;
957         w = x2 - x1;
958         h = y2 - y1;
959         if (w < 1 || h < 1)
960                 return;
961         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
962         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
963         for (;y < bandy;y++)
964         {
965                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
966                 for (x = x1;x < x2;x++)
967                         p[x] = c;
968         }
969 }
970 void DPSOFTRAST_ClearDepth(float d)
971 {
972         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
973         command->depth = d;
974 }
975
976 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
977 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
978 {
979         thread->colormask[0] = command->r != 0;
980         thread->colormask[1] = command->g != 0;
981         thread->colormask[2] = command->b != 0;
982         thread->colormask[3] = command->a != 0;
983         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
984 }
985 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
986 {
987         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
988         command->r = r;
989         command->g = g;
990         command->b = b;
991         command->a = a;
992 }
993
994 DEFCOMMAND(5, DepthTest, int enable;)
995 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
996 {
997         thread->depthtest = command->enable;
998         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
999 }
1000 void DPSOFTRAST_DepthTest(int enable)
1001 {
1002         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1003         command->enable = enable;
1004 }
1005
1006 DEFCOMMAND(6, ScissorTest, int enable;)
1007 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1008 {
1009         thread->scissortest = command->enable;
1010         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1011 }
1012 void DPSOFTRAST_ScissorTest(int enable)
1013 {
1014         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1015         command->enable = enable;
1016 }
1017
1018 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1019 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1020 {
1021         thread->scissor[0] = command->x;
1022         thread->scissor[1] = command->y;
1023         thread->scissor[2] = command->width;
1024         thread->scissor[3] = command->height;
1025         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1026 }
1027 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1028 {
1029         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1030         command->x = x;
1031         command->y = y;
1032         command->width = width;
1033         command->height = height;
1034 }
1035
1036 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1037 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1038 {
1039         thread->blendfunc[0] = command->sfactor;
1040         thread->blendfunc[1] = command->dfactor;
1041         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1042 }
1043 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1044 {
1045         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1046         command->sfactor = sfactor;
1047         command->dfactor = dfactor;
1048 }
1049
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1052 {
1053         thread->blendsubtract = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1055 }
1056 void DPSOFTRAST_BlendSubtract(int enable)
1057 {
1058         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059         command->enable = enable;
1060 }
1061
1062 DEFCOMMAND(10, DepthMask, int enable;)
1063 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1064 {
1065         thread->depthmask = command->enable;
1066 }
1067 void DPSOFTRAST_DepthMask(int enable)
1068 {
1069         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1070         command->enable = enable;
1071 }
1072
1073 DEFCOMMAND(11, DepthFunc, int func;)
1074 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1075 {
1076         thread->depthfunc = command->func;
1077 }
1078 void DPSOFTRAST_DepthFunc(int func)
1079 {
1080         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1081         command->func = func;
1082 }
1083
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1086 {
1087         thread->depthrange[0] = command->nearval;
1088         thread->depthrange[1] = command->farval;
1089 }
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1091 {
1092         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093         command->nearval = nearval;
1094         command->farval = farval;
1095 }
1096
1097 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1098 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1099 {
1100         thread->polygonoffset[0] = command->alongnormal;
1101         thread->polygonoffset[1] = command->intoview;
1102 }
1103 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1104 {
1105         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1106         command->alongnormal = alongnormal;
1107         command->intoview = intoview;
1108 }
1109
1110 DEFCOMMAND(14, CullFace, int mode;)
1111 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1112 {
1113         thread->cullface = command->mode;
1114 }
1115 void DPSOFTRAST_CullFace(int mode)
1116 {
1117         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1118         command->mode = mode;
1119 }
1120
1121 DEFCOMMAND(15, AlphaTest, int enable;)
1122 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1123 {
1124         thread->alphatest = command->enable;
1125 }
1126 void DPSOFTRAST_AlphaTest(int enable)
1127 {
1128         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1129         command->enable = enable;
1130 }
1131
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1134 {
1135         thread->alphafunc = command->func;
1136         thread->alphavalue = command->ref;
1137 }
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1139 {
1140         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141         command->func = func;
1142         command->ref = ref;
1143 }
1144
1145 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1146 {
1147         dpsoftrast.color[0] = r;
1148         dpsoftrast.color[1] = g;
1149         dpsoftrast.color[2] = b;
1150         dpsoftrast.color[3] = a;
1151 }
1152
1153 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1154 {
1155         int outstride = blockwidth * 4;
1156         int instride = dpsoftrast.fb_width * 4;
1157         int bx1 = blockx;
1158         int by1 = blocky;
1159         int bx2 = blockx + blockwidth;
1160         int by2 = blocky + blockheight;
1161         int bw;
1162         int x;
1163         int y;
1164         unsigned char *inpixels;
1165         unsigned char *b;
1166         unsigned char *o;
1167         DPSOFTRAST_Flush();
1168         if (bx1 < 0) bx1 = 0;
1169         if (by1 < 0) by1 = 0;
1170         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1171         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1172         bw = bx2 - bx1;
1173         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1174         if (dpsoftrast.bigendian)
1175         {
1176                 for (y = by1;y < by2;y++)
1177                 {
1178                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1180                         for (x = bx1;x < bx2;x++)
1181                         {
1182                                 o[0] = b[3];
1183                                 o[1] = b[2];
1184                                 o[2] = b[1];
1185                                 o[3] = b[0];
1186                                 o += 4;
1187                                 b += 4;
1188                         }
1189                 }
1190         }
1191         else
1192         {
1193                 for (y = by1;y < by2;y++)
1194                 {
1195                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1197                         memcpy(o, b, bw*4);
1198                 }
1199         }
1200
1201 }
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1203 {
1204         int tx1 = tx;
1205         int ty1 = ty;
1206         int tx2 = tx + width;
1207         int ty2 = ty + height;
1208         int sx1 = sx;
1209         int sy1 = sy;
1210         int sx2 = sx + width;
1211         int sy2 = sy + height;
1212         int swidth;
1213         int sheight;
1214         int twidth;
1215         int theight;
1216         int sw;
1217         int sh;
1218         int tw;
1219         int th;
1220         int y;
1221         unsigned int *spixels;
1222         unsigned int *tpixels;
1223         DPSOFTRAST_Texture *texture;
1224         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225         if (mip < 0 || mip >= texture->mipmaps) return;
1226         DPSOFTRAST_Flush();
1227         spixels = dpsoftrast.fb_colorpixels[0];
1228         swidth = dpsoftrast.fb_width;
1229         sheight = dpsoftrast.fb_height;
1230         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231         twidth = texture->mipmap[mip][2];
1232         theight = texture->mipmap[mip][3];
1233         if (tx1 < 0) tx1 = 0;
1234         if (ty1 < 0) ty1 = 0;
1235         if (tx2 > twidth) tx2 = twidth;
1236         if (ty2 > theight) ty2 = theight;
1237         if (sx1 < 0) sx1 = 0;
1238         if (sy1 < 0) sy1 = 0;
1239         if (sx2 > swidth) sx2 = swidth;
1240         if (sy2 > sheight) sy2 = sheight;
1241         tw = tx2 - tx1;
1242         th = ty2 - ty1;
1243         sw = sx2 - sx1;
1244         sh = sy2 - sy1;
1245         if (tw > sw) tw = sw;
1246         if (th > sh) th = sh;
1247         if (tw < 1 || th < 1)
1248                 return;
1249         sy1 = sheight - 1 - sy1;
1250         for (y = 0;y < th;y++)
1251                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1252         if (texture->mipmaps > 1)
1253                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1254 }
1255
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1258 {
1259         if (thread->texbound[command->unitnum])
1260                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261         thread->texbound[command->unitnum] = command->texture;
1262 }
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1264 {
1265         DPSOFTRAST_Command_SetTexture *command;
1266         DPSOFTRAST_Texture *texture;
1267         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1268         {
1269                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1270                 return;
1271         }
1272         texture = DPSOFTRAST_Texture_GetByIndex(index);
1273         if (index && !texture)
1274         {
1275                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1276                 return;
1277         }
1278
1279         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280         command->unitnum = unitnum;
1281         command->texture = texture;
1282
1283         dpsoftrast.texbound[unitnum] = texture;
1284         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1285 }
1286
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1288 {
1289         dpsoftrast.pointer_vertex3f = vertex3f;
1290         dpsoftrast.stride_vertex = stride;
1291 }
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1293 {
1294         dpsoftrast.pointer_color4f = color4f;
1295         dpsoftrast.pointer_color4ub = NULL;
1296         dpsoftrast.stride_color = stride;
1297 }
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1299 {
1300         dpsoftrast.pointer_color4f = NULL;
1301         dpsoftrast.pointer_color4ub = color4ub;
1302         dpsoftrast.stride_color = stride;
1303 }
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1305 {
1306         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308         dpsoftrast.stride_texcoord[unitnum] = stride;
1309 }
1310
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1313 {
1314         thread->shader_mode = command->mode;
1315         thread->shader_permutation = command->permutation;
1316 }
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1318 {
1319         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320         command->mode = mode;
1321         command->permutation = permutation;
1322
1323         dpsoftrast.shader_mode = mode;
1324         dpsoftrast.shader_permutation = permutation;
1325 }
1326
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1329 {
1330         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1331 }
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1333 {
1334         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335         command->index = index;
1336         command->val[0] = v0;
1337         command->val[1] = v1;
1338         command->val[2] = v2;
1339         command->val[3] = v3;
1340
1341         dpsoftrast.uniform4f[index*4+0] = v0;
1342         dpsoftrast.uniform4f[index*4+1] = v1;
1343         dpsoftrast.uniform4f[index*4+2] = v2;
1344         dpsoftrast.uniform4f[index*4+3] = v3;
1345 }
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1347 {
1348         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349         command->index = index;
1350         memcpy(command->val, v, sizeof(command->val));
1351
1352         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1353 }
1354
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1357 {
1358         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1359 }
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1361 {
1362 #ifdef SSE2_PRESENT
1363         int i, index;
1364         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1365         {
1366                 __m128 m0, m1, m2, m3;
1367                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368                 command->index = (DPSOFTRAST_UNIFORM)index;
1369                 if (((size_t)v)&(ALIGN_SIZE-1))
1370                 {
1371                         m0 = _mm_loadu_ps(v);
1372                         m1 = _mm_loadu_ps(v+4);
1373                         m2 = _mm_loadu_ps(v+8);
1374                         m3 = _mm_loadu_ps(v+12);
1375                 }
1376                 else
1377                 {
1378                         m0 = _mm_load_ps(v);
1379                         m1 = _mm_load_ps(v+4);
1380                         m2 = _mm_load_ps(v+8);
1381                         m3 = _mm_load_ps(v+12);
1382                 }
1383                 if (transpose)
1384                 {
1385                         __m128 t0, t1, t2, t3;
1386                         t0 = _mm_unpacklo_ps(m0, m1);
1387                         t1 = _mm_unpacklo_ps(m2, m3);
1388                         t2 = _mm_unpackhi_ps(m0, m1);
1389                         t3 = _mm_unpackhi_ps(m2, m3);
1390                         m0 = _mm_movelh_ps(t0, t1);
1391                         m1 = _mm_movehl_ps(t1, t0);
1392                         m2 = _mm_movelh_ps(t2, t3);
1393                         m3 = _mm_movehl_ps(t3, t2);                     
1394                 }
1395                 _mm_store_ps(command->val, m0);
1396                 _mm_store_ps(command->val+4, m1);
1397                 _mm_store_ps(command->val+8, m2);
1398                 _mm_store_ps(command->val+12, m3);
1399                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1403         }
1404 #endif
1405 }
1406
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1409 {
1410         thread->uniform1i[command->index] = command->val;
1411 }
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1413 {
1414         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415         command->index = index;
1416         command->val = i0;
1417
1418         dpsoftrast.uniform1i[command->index] = i0;
1419 }
1420
1421 #ifdef SSE2_PRESENT
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1423 {
1424         float *end = dst + size*4;
1425         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1426         {
1427                 while (dst < end)
1428                 {
1429                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1430                         dst += 4;
1431                         src += stride;
1432                 }
1433         }
1434         else
1435         {
1436                 while (dst < end)
1437                 {
1438                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1439                         dst += 4;
1440                         src += stride;
1441                 }
1442         }
1443 }
1444
// helper for DPSOFTRAST_Load3fTo4f: takes four tightly packed xyz triples held
// in three registers (x0 y0 z0 x1 | y1 z1 x2 y2 | z2 x3 y3 z3) and stores them
// to dst as four (x, y, z, 1) vectors using aligned stores
static void DPSOFTRAST_Load3fTo4f_Store4(float *dst, __m128 v1, __m128 v2, __m128 v3)
{
	__m128 dv;
	// each vertex is first arranged as (?, x, y, z), then element 0 is
	// overwritten with 1 and the vector is rotated to (x, y, z, 1)
	dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
	dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
	_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
	dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
	dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
	_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
	dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
	dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
	dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
	_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
	dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
	_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
}

// expands `size` three-float elements (x, y, z) read from the strided byte
// stream `src` into (x, y, z, 1) four-float vectors at `dst`
// dst is written with aligned stores
// exactly 12 bytes are read per element: the previous per-element path used a
// 16 byte load, which read 4 bytes beyond the last element of the source array
static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	if (stride == (int)sizeof(float[3]))
	{
		// tightly packed input: four elements are exactly three 16 byte loads
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				const float *in = (const float *)src;
				DPSOFTRAST_Load3fTo4f_Store4(dst, _mm_loadu_ps(in), _mm_loadu_ps(in + 4), _mm_loadu_ps(in + 8));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
		else
		{
			while (dst < end4)
			{
				const float *in = (const float *)src;
				DPSOFTRAST_Load3fTo4f_Store4(dst, _mm_load_ps(in), _mm_load_ps(in + 4), _mm_load_ps(in + 8));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
	}
	// leftover elements of the packed case, or every element of a strided array
	// neither load used here has an alignment requirement, so one loop serves
	// aligned and unaligned sources alike
	while (dst < end)
	{
		const float *in = (const float *)src;
		__m128 xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)in); // (x, y, 0, 0)
		__m128 z1 = _mm_unpacklo_ps(_mm_load_ss(in + 2), _mm_set_ss(1.0f)); // (z, 1, 0, 0)
		_mm_store_ps(dst, _mm_movelh_ps(xy, z1)); // (x, y, z, 1)
		dst += 4;
		src += stride;
	}
}
1521
// expands `size` two-float elements (x, y) read from the strided byte stream
// `src` into (x, y, 0, 1) four-float vectors at `dst`
// dst is written with aligned stores
static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	// supplies the constant upper half (0, 1) of every output vector
	const __m128 tail = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
	if (stride == sizeof(float[2]))
	{
		// tightly packed input: one 16 byte load yields two elements
		float *pairend = dst + (size&~1)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1))
		{
			// unaligned source
			for (;dst < pairend;dst += 8, src += 2*sizeof(float[2]))
			{
				__m128 pair = _mm_loadu_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(pair, tail, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(tail, pair));
			}
		}
		else
		{
			for (;dst < pairend;dst += 8, src += 2*sizeof(float[2]))
			{
				__m128 pair = _mm_load_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(pair, tail, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(tail, pair));
			}
		}
	}
	// odd leftover element, or every element of a strided array: 8 byte loads
	for (;dst < end;dst += 4, src += stride)
		_mm_store_ps(dst, _mm_loadl_pi(tail, (const __m64 *)src));
}
1559
// helper for DPSOFTRAST_Load4bTo4f: widens the 16 unsigned bytes in `v` to
// floats, multiplies them by `scale` and stores the 16 results to dst using
// aligned stores
static void DPSOFTRAST_Load4bTo4f_Store16(float *dst, __m128i v, __m128 scale)
{
	__m128i lo = _mm_unpacklo_epi8(v, _mm_setzero_si128());
	__m128i hi = _mm_unpackhi_epi8(v, _mm_setzero_si128());
	_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(lo, _mm_setzero_si128())), scale));
	_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(lo, _mm_setzero_si128())), scale));
	_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(hi, _mm_setzero_si128())), scale));
	_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(hi, _mm_setzero_si128())), scale));
}

// converts `size` four-byte elements read from the strided byte stream `src`
// into four-float vectors at `dst`, scaling every byte by 1/255
// dst is written with aligned stores
// the per-element path copies its 4 bytes with memcpy rather than dereferencing
// src as an int, because src + n*stride is not guaranteed to be int aligned
static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	__m128 scale = _mm_set1_ps(1.0f/255.0f);
	if (stride == (int)sizeof(unsigned char[4]))
	{
		// tightly packed input: one 16 byte load yields four elements
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				DPSOFTRAST_Load4bTo4f_Store16(dst, _mm_loadu_si128((const __m128i *)src), scale);
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
		else
		{
			while (dst < end4)
			{
				DPSOFTRAST_Load4bTo4f_Store16(dst, _mm_load_si128((const __m128i *)src), scale);
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
	}
	// leftover elements of the packed case, or every element of a strided array
	while (dst < end)
	{
		int packed;
		__m128i v;
		memcpy(&packed, src, sizeof(packed));
		v = _mm_cvtsi32_si128(packed);
		_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
		dst += 4;
		src += stride;
	}
}
1602
// writes `size` copies of the four floats at `src` to `dst`
// src is read once with an unaligned load, dst is written with aligned stores
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0;i < size;i++)
		_mm_store_ps(dst + i*4, value);
}
1613 #endif
1614
// multiplies `numitems` four-float vectors from in4f by the 4x4 matrix
// inmatrix16f and writes the results to out4f (out = x*row0 + y*row1 + z*row2 + w*row3,
// where row N is inmatrix16f[N*4 .. N*4+3])
// out4f is written with aligned stores; in4f may be unaligned and may be the
// same pointer as out4f
// a matrix that is bitwise equal to the identity is handled as a plain copy
// without SSE2_PRESENT this function does nothing
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	int i;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// nothing to compute, only move the data if it is not already in place
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1))
	{
		// unaligned source
		for (i = 0;i < numitems;i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
	else
	{
		for (i = 0;i < numitems;i++, in4f += 4, out4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
#endif
}
1662
// copies `numitems` four-float vectors from in4f to out4f with memcpy
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1667
1668 #ifdef SSE2_PRESENT
// projects the vertex `in` = (x, y, z, w) through the viewport:
// the vertex is rearranged to (1, x, y, z), multiplied by viewportscale,
// divided by w, offset by viewportcenter and rotated back, so that
// out = (cx + sx*x/w, cy + sy*y/w, cz + sz*z/w, cw + sw/w)
// where viewportcenter = (cw, cx, cy, cz) and viewportscale = (sw, sx, sy, sz)
// `out` may be the same variable as `in`
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w; \
	w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_div_ps(_mm_mul_ps(viewportscale, p), w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1676
// performs the same computation as DPSOFTRAST_PROJECTVERTEX (the two macro
// bodies are equivalent): `in` = (x, y, z, w) is rearranged to (1, x, y, z),
// multiplied by viewportscale, divided by w, offset by viewportcenter and
// rotated back into x, y, z, w order before being assigned to `out`
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w; \
	w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(p, w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1684
// multiplies the vertex `in` = (x, y, z, w) by the matrix rows m0..m3:
// out = x*m0 + (y*m1 + (z*m2 + w*m3))
// `out` may be the same variable as `in`
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	__m128 px_ = _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0); \
	__m128 py_ = _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1); \
	__m128 pz_ = _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2); \
	__m128 pw_ = _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	out = _mm_add_ps(px_, _mm_add_ps(py_, _mm_add_ps(pz_, pw_))); \
}
1693
// computes a screen-space Y range for the box spanned by minpos/maxpos after
// transforming its eight corners by the matrix rows m0..m3
// corner k takes its first component from maxpos when bit 0 of k is set, its
// second when bit 1 is set and its third and fourth when bit 2 is set,
// otherwise from minpos
// a corner passes when its transformed z + w is >= 0; its y/w then extends the
// tracked range, and every box edge joining a failing corner to a passing one
// contributes the y/w of the point where z + w reaches zero
// the range is limited to -2..2, mapped through element 2 of viewportscale and
// viewportcenter, and truncated: *starty comes from the largest value,
// *endy from the smallest value plus one
// returns a mask in which bit k is set when corner k failed the z + w >= 0 test
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	int clipmask = 0xFF;
	int k, axis;
	__m128 corner[8], bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
	// swap the first two components of every matrix row so the transformed y
	// ends up in element 0, where the scalar (_ss) operations below act
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
	corner[0] = minpos;
	corner[1] = _mm_move_ss(minpos, maxpos);
	corner[2] = _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[3] = _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[4] = _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[5] = _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[6] = _mm_move_ss(maxpos, minpos);
	corner[7] = maxpos;
	// transform all corners and accumulate the ones that pass the z + w test
	for (k = 0;k < 8;k++)
	{
		__m128 cornerw;
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], corner[k], m0, m1, m2, m3);
		cornerw = _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3));
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), cornerw);
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps()))
		{
			__m128 proj = _mm_div_ss(bb[k], cornerw);
			clipmask &= ~(1<<k);
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}
	// for every failing corner, walk its three box edges (neighbours k^1, k^2, k^4)
	// and add the crossing point of each edge that leads to a passing corner
	for (k = 0;k < 8;k++)
	{
		if (!(clipmask&(1<<k)))
			continue;
		for (axis = 1;axis < 8;axis <<= 1)
		{
			int n = k^axis;
			__m128 frac, proj;
			if (clipmask&(1<<n))
				continue;
			frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[n]));
			proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[n], bb[k])));
			proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3)));
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}
	// bring element 2 of the viewport vectors into element 0
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1764         
1765 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1766 {
1767         float *end = out4f + numitems*4;
1768         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1769         __m128 minpos, maxpos;
1770         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1771         {
1772                 minpos = maxpos = _mm_loadu_ps(in4f);
1773                 while (out4f < end)
1774                 {
1775                         __m128 v = _mm_loadu_ps(in4f);
1776                         minpos = _mm_min_ps(minpos, v);
1777                         maxpos = _mm_max_ps(maxpos, v);
1778                         _mm_store_ps(out4f, v);
1779                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1780                         _mm_store_ps(screen4f, v);
1781                         in4f += 4;
1782                         out4f += 4;
1783                         screen4f += 4;
1784                 }
1785         }
1786         else
1787         {
1788                 minpos = maxpos = _mm_load_ps(in4f);
1789                 while (out4f < end)
1790                 {
1791                         __m128 v = _mm_load_ps(in4f);
1792                         minpos = _mm_min_ps(minpos, v);
1793                         maxpos = _mm_max_ps(maxpos, v);
1794                         _mm_store_ps(out4f, v);
1795                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1796                         _mm_store_ps(screen4f, v);
1797                         in4f += 4;
1798                         out4f += 4;
1799                         screen4f += 4;
1800                 }
1801         }
1802         if (starty && endy) 
1803                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1804                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1805                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1806                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1807                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1808         return 0;
1809 }
1810
1811 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1812 {
1813         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1814         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1815         float *end;
1816         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1817                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1818         end = out4f + numitems*4;
1819         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1820         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1821         m0 = _mm_loadu_ps(inmatrix16f);
1822         m1 = _mm_loadu_ps(inmatrix16f + 4);
1823         m2 = _mm_loadu_ps(inmatrix16f + 8);
1824         m3 = _mm_loadu_ps(inmatrix16f + 12);
1825         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1826         {
1827                 minpos = maxpos = _mm_loadu_ps(in4f);
1828                 while (out4f < end)
1829                 {
1830                         __m128 v = _mm_loadu_ps(in4f);
1831                         minpos = _mm_min_ps(minpos, v);
1832                         maxpos = _mm_max_ps(maxpos, v);
1833                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1834                         _mm_store_ps(out4f, v);
1835                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1836                         _mm_store_ps(screen4f, v);
1837                         in4f += 4;
1838                         out4f += 4;
1839                         screen4f += 4;
1840                 }
1841         }
1842         else
1843         {
1844                 minpos = maxpos = _mm_load_ps(in4f);
1845                 while (out4f < end)
1846                 {
1847                         __m128 v = _mm_load_ps(in4f);
1848                         minpos = _mm_min_ps(minpos, v);
1849                         maxpos = _mm_max_ps(maxpos, v);
1850                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1851                         _mm_store_ps(out4f, v);
1852                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1853                         _mm_store_ps(screen4f, v);
1854                         in4f += 4;
1855                         out4f += 4;
1856                         screen4f += 4;
1857                 }
1858         }
1859         if (starty && endy) 
1860                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1861         return 0;
1862 }
1863 #endif
1864
// fills dpsoftrast.post_array4f[outarray] with four-float data for the
// dpsoftrast.numvertices vertices starting at dpsoftrast.firstvertex, read
// from the client array selected by inarray, and returns that buffer
// - position: three floats per vertex, expanded to four
// - color: four floats, else four bytes, else dpsoftrast.color for every vertex
// - anything else is treated as texcoord unit inarray-DPSOFTRAST_ARRAY_TEXCOORD0
//   with 2, 3 or 4 float components; when that unit has no pointer (or another
//   component count) the buffer is returned without being written
// returns NULL without SSE2_PRESENT
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	int firstvertex = dpsoftrast.firstvertex;
	int numvertices = dpsoftrast.numvertices;
	int stride;
	int unit;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound, replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1923
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1925 {
1926         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1928         return data;
1929 }
1930
#if 0
// currently compiled out
// projects an array of dpsoftrast.numvertices four-float vectors in place,
// writing screen coordinates to dpsoftrast.screencoord4f and storing the
// result of DPSOFTRAST_Vertex_Project in dpsoftrast.drawclipped
// with inarray >= 0 the data is first loaded via DPSOFTRAST_Array_Load
// returns NULL without SSE2_PRESENT
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1943
// transforms an array of dpsoftrast.numvertices four-float vectors in place by
// inmatrix16f, writes the projected screen coordinates to
// dpsoftrast.screencoord4f, updates dpsoftrast.drawstarty/drawendy and stores
// the result of DPSOFTRAST_Vertex_TransformProject in dpsoftrast.drawclipped
// with inarray >= 0 the data is first loaded via DPSOFTRAST_Array_Load,
// otherwise the current contents of post_array4f[outarray] are used
// returns the transformed array, or NULL without SSE2_PRESENT
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1954
1955 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1956 {
1957         int x;
1958         int startx = span->startx;
1959         int endx = span->endx;
1960         float wslope = triangle->w[0];
1961         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1962         float endz = 1.0f / (w + wslope * startx);
1963         for (x = startx;x < endx;)
1964         {
1965                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1966                 float z = endz, dz;
1967                 if (nextsub >= endx) nextsub = endsub = endx-1;
1968                 endz = 1.0f / (w + wslope * nextsub);
1969                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1970                 for (; x <= endsub; x++, z += dz)
1971                         zf[x] = z;
1972         }
1973 }
1974
1975 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1976 {
1977         int x;
1978         int startx = span->startx;
1979         int endx = span->endx;
1980         int d[4];
1981         float a, b;
1982         unsigned char * RESTRICT pixelmask = span->pixelmask;
1983         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1984         if (!pixel)
1985                 return;
1986         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1987         // handle alphatest now (this affects depth writes too)
1988         if (thread->alphatest)
1989                 for (x = startx;x < endx;x++)
1990                         if (in4f[x*4+3] < 0.5f)
1991                                 pixelmask[x] = false;
1992         // FIXME: this does not handle bigendian
1993         switch(thread->fb_blendmode)
1994         {
1995         case DPSOFTRAST_BLENDMODE_OPAQUE:
1996                 for (x = startx;x < endx;x++)
1997                 {
1998                         if (!pixelmask[x])
1999                                 continue;
2000                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2001                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2002                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2003                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2004                         pixel[x*4+0] = d[0];
2005                         pixel[x*4+1] = d[1];
2006                         pixel[x*4+2] = d[2];
2007                         pixel[x*4+3] = d[3];
2008                 }
2009                 break;
2010         case DPSOFTRAST_BLENDMODE_ALPHA:
2011                 for (x = startx;x < endx;x++)
2012                 {
2013                         if (!pixelmask[x])
2014                                 continue;
2015                         a = in4f[x*4+3] * 255.0f;
2016                         b = 1.0f - in4f[x*4+3];
2017                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2018                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2019                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2020                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2021                         pixel[x*4+0] = d[0];
2022                         pixel[x*4+1] = d[1];
2023                         pixel[x*4+2] = d[2];
2024                         pixel[x*4+3] = d[3];
2025                 }
2026                 break;
2027         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2028                 for (x = startx;x < endx;x++)
2029                 {
2030                         if (!pixelmask[x])
2031                                 continue;
2032                         a = in4f[x*4+3] * 255.0f;
2033                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2034                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2035                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2036                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2037                         pixel[x*4+0] = d[0];
2038                         pixel[x*4+1] = d[1];
2039                         pixel[x*4+2] = d[2];
2040                         pixel[x*4+3] = d[3];
2041                 }
2042                 break;
2043         case DPSOFTRAST_BLENDMODE_ADD:
2044                 for (x = startx;x < endx;x++)
2045                 {
2046                         if (!pixelmask[x])
2047                                 continue;
2048                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2049                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2050                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2051                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2052                         pixel[x*4+0] = d[0];
2053                         pixel[x*4+1] = d[1];
2054                         pixel[x*4+2] = d[2];
2055                         pixel[x*4+3] = d[3];
2056                 }
2057                 break;
2058         case DPSOFTRAST_BLENDMODE_INVMOD:
2059                 for (x = startx;x < endx;x++)
2060                 {
2061                         if (!pixelmask[x])
2062                                 continue;
2063                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2064                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2065                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2066                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2067                         pixel[x*4+0] = d[0];
2068                         pixel[x*4+1] = d[1];
2069                         pixel[x*4+2] = d[2];
2070                         pixel[x*4+3] = d[3];
2071                 }
2072                 break;
2073         case DPSOFTRAST_BLENDMODE_MUL:
2074                 for (x = startx;x < endx;x++)
2075                 {
2076                         if (!pixelmask[x])
2077                                 continue;
2078                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2079                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2080                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2081                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2082                         pixel[x*4+0] = d[0];
2083                         pixel[x*4+1] = d[1];
2084                         pixel[x*4+2] = d[2];
2085                         pixel[x*4+3] = d[3];
2086                 }
2087                 break;
2088         case DPSOFTRAST_BLENDMODE_MUL2:
2089                 for (x = startx;x < endx;x++)
2090                 {
2091                         if (!pixelmask[x])
2092                                 continue;
2093                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2094                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2095                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2096                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2097                         pixel[x*4+0] = d[0];
2098                         pixel[x*4+1] = d[1];
2099                         pixel[x*4+2] = d[2];
2100                         pixel[x*4+3] = d[3];
2101                 }
2102                 break;
2103         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2104                 for (x = startx;x < endx;x++)
2105                 {
2106                         if (!pixelmask[x])
2107                                 continue;
2108                         a = in4f[x*4+3] * -255.0f;
2109                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2110                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2111                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2112                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2113                         pixel[x*4+0] = d[0];
2114                         pixel[x*4+1] = d[1];
2115                         pixel[x*4+2] = d[2];
2116                         pixel[x*4+3] = d[3];
2117                 }
2118                 break;
2119         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2120                 for (x = startx;x < endx;x++)
2121                 {
2122                         if (!pixelmask[x])
2123                                 continue;
2124                         a = 255.0f;
2125                         b = 1.0f - in4f[x*4+3];
2126                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2127                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2128                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2129                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2130                         pixel[x*4+0] = d[0];
2131                         pixel[x*4+1] = d[1];
2132                         pixel[x*4+2] = d[2];
2133                         pixel[x*4+3] = d[3];
2134                 }
2135                 break;
2136         case DPSOFTRAST_BLENDMODE_INVADD:
2137                 for (x = startx;x < endx;x++)
2138                 {
2139                         if (!pixelmask[x])
2140                                 continue;
2141                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2142                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2143                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2144                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2145                         pixel[x*4+0] = d[0];
2146                         pixel[x*4+1] = d[1];
2147                         pixel[x*4+2] = d[2];
2148                         pixel[x*4+3] = d[3];
2149                 }
2150                 break;
2151         }
2152 }
2153
2154 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2155 {
2156 #ifdef SSE2_PRESENT
2157         int x;
2158         int startx = span->startx;
2159         int endx = span->endx;
2160         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2161         unsigned char * RESTRICT pixelmask = span->pixelmask;
2162         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2163         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2164         if (!pixel)
2165                 return;
2166         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2167         pixeli += span->y * dpsoftrast.fb_width + span->x;
2168         // handle alphatest now (this affects depth writes too)
2169         if (thread->alphatest)
2170                 for (x = startx;x < endx;x++)
2171                         if (in4ub[x*4+3] < 0.5f)
2172                                 pixelmask[x] = false;
2173         // FIXME: this does not handle bigendian
2174         switch(thread->fb_blendmode)
2175         {
2176         case DPSOFTRAST_BLENDMODE_OPAQUE:
2177                 for (x = startx;x + 4 <= endx;)
2178                 {
2179                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2180                         {
2181                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2182                                 x += 4;
2183                         }
2184                         else
2185                         {
2186                                 if (pixelmask[x])
2187                                         pixeli[x] = ini[x];
2188                                 x++;
2189                         }
2190                 }
2191                 for (;x < endx;x++)
2192                         if (pixelmask[x])
2193                                 pixeli[x] = ini[x];
2194                 break;
2195         case DPSOFTRAST_BLENDMODE_ALPHA:
2196         #define FINISHBLEND(blend2, blend1) \
2197                 for (x = startx;x + 1 < endx;x += 2) \
2198                 { \
2199                         __m128i src, dst; \
2200                         switch (*(const unsigned short*)&pixelmask[x]) \
2201                         { \
2202                         case 0x0101: \
2203                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2204                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2205                                 blend2; \
2206                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2207                                 continue; \
2208                         case 0x0100: \
2209                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2210                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2211                                 blend1; \
2212                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2213                                 continue; \
2214                         case 0x0001: \
2215                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2216                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2217                                 blend1; \
2218                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2219                                 continue; \
2220                         } \
2221                         break; \
2222                 } \
2223                 for(;x < endx; x++) \
2224                 { \
2225                         __m128i src, dst; \
2226                         if (!pixelmask[x]) \
2227                                 continue; \
2228                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2229                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2230                         blend1; \
2231                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2232                 }
2233
2234                 FINISHBLEND({
2235                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2236                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2237                 }, {
2238                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2239                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2240                 });
2241                 break;
2242         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2243                 FINISHBLEND({
2244                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2245                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2246                 }, {
2247                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2248                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249                 });
2250                 break;
2251         case DPSOFTRAST_BLENDMODE_ADD:
2252                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2253                 break;
2254         case DPSOFTRAST_BLENDMODE_INVMOD:
2255                 FINISHBLEND({
2256                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2257                 }, {
2258                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2259                 });
2260                 break;
2261         case DPSOFTRAST_BLENDMODE_MUL:
2262                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2263                 break;
2264         case DPSOFTRAST_BLENDMODE_MUL2:
2265                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2266                 break;
2267         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2268                 FINISHBLEND({
2269                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2270                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2271                 }, {
2272                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2273                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                 });
2275                 break;
2276         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2277                 FINISHBLEND({
2278                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2279                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2280                 }, {
2281                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2282                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2283                 });
2284                 break;
2285         case DPSOFTRAST_BLENDMODE_INVADD:
2286                 FINISHBLEND({
2287                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2288                 }, {
2289                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2290                 });
2291                 break;
2292         }
2293 #endif
2294 }
2295
2296 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2297 {
2298         int x;
2299         int startx = span->startx;
2300         int endx = span->endx;
2301         int flags;
2302         float c[4];
2303         float data[4];
2304         float slope[4];
2305         float tc[2], endtc[2];
2306         float tcscale[2];
2307         unsigned int tci[2];
2308         unsigned int tci1[2];
2309         unsigned int tcimin[2];
2310         unsigned int tcimax[2];
2311         int tciwrapmask[2];
2312         int tciwidth;
2313         int filter;
2314         int mip;
2315         const unsigned char * RESTRICT pixelbase;
2316         const unsigned char * RESTRICT pixel[4];
2317         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2318         // if no texture is bound, just fill it with white
2319         if (!texture)
2320         {
2321                 for (x = startx;x < endx;x++)
2322                 {
2323                         out4f[x*4+0] = 1.0f;
2324                         out4f[x*4+1] = 1.0f;
2325                         out4f[x*4+2] = 1.0f;
2326                         out4f[x*4+3] = 1.0f;
2327                 }
2328                 return;
2329         }
2330         mip = triangle->mip[texunitindex];
2331         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2332         // if this mipmap of the texture is 1 pixel, just fill it with that color
2333         if (texture->mipmap[mip][1] == 4)
2334         {
2335                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2336                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2337                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2338                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2339                 for (x = startx;x < endx;x++)
2340                 {
2341                         out4f[x*4+0] = c[0];
2342                         out4f[x*4+1] = c[1];
2343                         out4f[x*4+2] = c[2];
2344                         out4f[x*4+3] = c[3];
2345                 }
2346                 return;
2347         }
2348         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2349         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2350         flags = texture->flags;
2351         tcscale[0] = texture->mipmap[mip][2];
2352         tcscale[1] = texture->mipmap[mip][3];
2353         tciwidth = texture->mipmap[mip][2];
2354         tcimin[0] = 0;
2355         tcimin[1] = 0;
2356         tcimax[0] = texture->mipmap[mip][2]-1;
2357         tcimax[1] = texture->mipmap[mip][3]-1;
2358         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2359         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2360         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2361         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2362         for (x = startx;x < endx;)
2363         {
2364                 unsigned int subtc[2];
2365                 unsigned int substep[2];
2366                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2367                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2368                 if (nextsub >= endx)
2369                 {
2370                         nextsub = endsub = endx-1;      
2371                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2372                 }
2373                 tc[0] = endtc[0];
2374                 tc[1] = endtc[1];
2375                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2376                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2377                 substep[0] = (endtc[0] - tc[0]) * subscale;
2378                 substep[1] = (endtc[1] - tc[1]) * subscale;
2379                 subtc[0] = tc[0] * (1<<16);
2380                 subtc[1] = tc[1] * (1<<16);
2381                 if (filter)
2382                 {
2383                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2384                         {
2385                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2386                                 {
2387                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2388                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2389                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2390                                         tci[0] = subtc[0]>>16;
2391                                         tci[1] = subtc[1]>>16;
2392                                         tci1[0] = tci[0] + 1;
2393                                         tci1[1] = tci[1] + 1;
2394                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2395                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2396                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2397                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2398                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2399                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2400                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2401                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2402                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2403                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2404                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2405                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2406                                         out4f[x*4+0] = c[0];
2407                                         out4f[x*4+1] = c[1];
2408                                         out4f[x*4+2] = c[2];
2409                                         out4f[x*4+3] = c[3];
2410                                 }
2411                         }
2412                         else
2413                         {
2414                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2415                                 {
2416                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2417                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2418                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2419                                         tci[0] = subtc[0]>>16;
2420                                         tci[1] = subtc[1]>>16;
2421                                         tci1[0] = tci[0] + 1;
2422                                         tci1[1] = tci[1] + 1;
2423                                         tci[0] &= tciwrapmask[0];
2424                                         tci[1] &= tciwrapmask[1];
2425                                         tci1[0] &= tciwrapmask[0];
2426                                         tci1[1] &= tciwrapmask[1];
2427                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2428                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2429                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2430                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2431                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2432                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2433                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2434                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2435                                         out4f[x*4+0] = c[0];
2436                                         out4f[x*4+1] = c[1];
2437                                         out4f[x*4+2] = c[2];
2438                                         out4f[x*4+3] = c[3];
2439                                 }
2440                         }
2441                 }
2442                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2443                 {
2444                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2445                         {
2446                                 tci[0] = subtc[0]>>16;
2447                                 tci[1] = subtc[1]>>16;
2448                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2449                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2450                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2451                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2452                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2453                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2454                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2455                                 out4f[x*4+0] = c[0];
2456                                 out4f[x*4+1] = c[1];
2457                                 out4f[x*4+2] = c[2];
2458                                 out4f[x*4+3] = c[3];
2459                         }
2460                 }
2461                 else
2462                 {
2463                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2464                         {
2465                                 tci[0] = subtc[0]>>16;
2466                                 tci[1] = subtc[1]>>16;
2467                                 tci[0] &= tciwrapmask[0];
2468                                 tci[1] &= tciwrapmask[1];
2469                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2470                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2471                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2472                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2473                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2474                                 out4f[x*4+0] = c[0];
2475                                 out4f[x*4+1] = c[1];
2476                                 out4f[x*4+2] = c[2];
2477                                 out4f[x*4+3] = c[3];
2478                         }
2479                 }
2480         }
2481 }
2482
2483 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2484 {
2485 #ifdef SSE2_PRESENT
2486         int x;
2487         int startx = span->startx;
2488         int endx = span->endx;
2489         int flags;
2490         __m128 data, slope, tcscale;
2491         __m128i tcsize, tcmask, tcoffset, tcmax;
2492         __m128 tc, endtc;
2493         __m128i subtc, substep, endsubtc;
2494         int filter;
2495         int mip;
2496         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2497         const unsigned char * RESTRICT pixelbase;
2498         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2499         // if no texture is bound, just fill it with white
2500         if (!texture)
2501         {
2502                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2503                 return;
2504         }
2505         mip = triangle->mip[texunitindex];
2506         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2507         // if this mipmap of the texture is 1 pixel, just fill it with that color
2508         if (texture->mipmap[mip][1] == 4)
2509         {
2510                 unsigned int k = *((const unsigned int *)pixelbase);
2511                 for (x = startx;x < endx;x++)
2512                         outi[x] = k;
2513                 return;
2514         }
2515         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2516         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2517         flags = texture->flags;
2518         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2519         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2520         tcscale = _mm_cvtepi32_ps(tcsize);
2521         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2522         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2523         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2524         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2525         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2526         tcmax = _mm_packs_epi32(tcmask, tcmask);
2527         for (x = startx;x < endx;)
2528         {
2529                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2530                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2531                 if (nextsub >= endx)
2532                 {
2533                         nextsub = endsub = endx-1;
2534                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2535                 }       
2536                 tc = endtc;
2537                 subtc = endsubtc;
2538                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2539                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2540                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2541                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2542                 substep = _mm_slli_epi32(substep, 1);
2543                 if (filter)
2544                 {
2545                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2546                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2547                         {
2548                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2549                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2550                                 {
2551                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2552                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2553                                         tci = _mm_madd_epi16(tci, tcoffset);
2554                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2555                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2556                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2557                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2558                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2559                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2560                                         fracm = _mm_srli_epi16(subtc, 1);
2561                                         pix1 = _mm_add_epi16(pix1,
2562                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2563                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2564                                         pix3 = _mm_add_epi16(pix3,
2565                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2566                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2567                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2568                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2569                                         pix2 = _mm_add_epi16(pix2,
2570                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2571                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2572                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2573                                 }
2574                                 if (x <= endsub)
2575                                 {
2576                                         const unsigned char * RESTRICT ptr1;
2577                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2578                                         tci = _mm_madd_epi16(tci, tcoffset);
2579                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2580                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2581                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2582                                         fracm = _mm_srli_epi16(subtc, 1);
2583                                         pix1 = _mm_add_epi16(pix1,
2584                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2585                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2586                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2587                                         pix1 = _mm_add_epi16(pix1,
2588                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2589                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2590                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2591                                         x++;
2592                                 }
2593                         }
2594                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2595                         {
2596                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2597                                 {
2598                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2599                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2600                                         tci = _mm_madd_epi16(tci, tcoffset);
2601                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2602                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2603                                                                                         _mm_setzero_si128());
2604                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2605                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2606                                                                                         _mm_setzero_si128());
2607                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2608                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2609                                         tci = _mm_madd_epi16(tci, tcoffset);
2610                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2611                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2612                                                                                         _mm_setzero_si128());
2613                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2614                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2615                                                                                         _mm_setzero_si128());
2616                                         fracm = _mm_srli_epi16(subtc, 1);
2617                                         pix1 = _mm_add_epi16(pix1,
2618                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2619                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2620                                         pix3 = _mm_add_epi16(pix3,
2621                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2622                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2623                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2624                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2625                                         pix2 = _mm_add_epi16(pix2,
2626                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2627                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2628                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2629                                 }
2630                                 if (x <= endsub)
2631                                 {
2632                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2633                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2634                                         tci = _mm_madd_epi16(tci, tcoffset);
2635                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2636                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2637                                                                                         _mm_setzero_si128());
2638                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2639                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2640                                                                                         _mm_setzero_si128());
2641                                         fracm = _mm_srli_epi16(subtc, 1);
2642                                         pix1 = _mm_add_epi16(pix1,
2643                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2646                                         pix1 = _mm_add_epi16(pix1,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2649                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2650                                         x++;
2651                                 }
2652                         }
2653                         else
2654                         {
2655                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2656                                 {
2657                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2658                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659                                         tci = _mm_madd_epi16(tci, tcoffset);
2660                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662                                                                                         _mm_setzero_si128());
2663                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665                                                                                         _mm_setzero_si128());
2666                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2667                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2668                                         tci = _mm_madd_epi16(tci, tcoffset);
2669                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2670                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2671                                                                                         _mm_setzero_si128());
2672                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2673                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2674                                                                                         _mm_setzero_si128());
2675                                         fracm = _mm_srli_epi16(subtc, 1);
2676                                         pix1 = _mm_add_epi16(pix1,
2677                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2678                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2679                                         pix3 = _mm_add_epi16(pix3,
2680                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2681                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2682                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2683                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2684                                         pix2 = _mm_add_epi16(pix2,
2685                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2686                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2687                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2688                                 }
2689                                 if (x <= endsub)
2690                                 {
2691                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2692                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2693                                         tci = _mm_madd_epi16(tci, tcoffset);
2694                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2695                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2696                                                                                         _mm_setzero_si128());
2697                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2698                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2699                                                                                         _mm_setzero_si128());
2700                                         fracm = _mm_srli_epi16(subtc, 1);
2701                                         pix1 = _mm_add_epi16(pix1,
2702                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2703                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2704                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2705                                         pix1 = _mm_add_epi16(pix1,
2706                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2707                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2708                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2709                                         x++;
2710                                 }
2711                         }
2712                 }
2713                 else
2714                 {
2715                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2716                         {
2717                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2718                                 {
2719                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2720                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2721                                         tci = _mm_madd_epi16(tci, tcoffset);
2722                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2723                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2724                                 }
2725                                 if (x <= endsub)
2726                                 {
2727                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2728                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2729                                         tci = _mm_madd_epi16(tci, tcoffset);
2730                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2731                                         x++;
2732                                 }
2733                         }
2734                         else
2735                         {
2736                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2737                                 {
2738                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2739                                         tci = _mm_and_si128(tci, tcmax); 
2740                                         tci = _mm_madd_epi16(tci, tcoffset);
2741                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2742                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2743                                 }
2744                                 if (x <= endsub)
2745                                 {
2746                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2747                                         tci = _mm_and_si128(tci, tcmax); 
2748                                         tci = _mm_madd_epi16(tci, tcoffset);
2749                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2750                                         x++;
2751                                 }
2752                         }
2753                 }
2754         }
2755 #endif
2756 }
2757
2758 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2759 {
2760         // TODO: IMPLEMENT
2761         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2762 }
2763
// Shadowmap lookup stub: the argument is ignored and the constant 1.0f is
// returned for every input.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2769
2770 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2771 {
2772         int x;
2773         int startx = span->startx;
2774         int endx = span->endx;
2775         float c[4];
2776         float data[4];
2777         float slope[4];
2778         float z;
2779         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2780         for (x = startx;x < endx;x++)
2781         {
2782                 z = zf[x];
2783                 c[0] = (data[0] + slope[0]*x) * z;
2784                 c[1] = (data[1] + slope[1]*x) * z;
2785                 c[2] = (data[2] + slope[2]*x) * z;
2786                 c[3] = (data[3] + slope[3]*x) * z;
2787                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2788                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2789                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2790                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2791         }
2792 }
2793
2794 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2795 {
2796         int x;
2797         int startx = span->startx;
2798         int endx = span->endx;
2799         float c[4];
2800         float data[4];
2801         float slope[4];
2802         float z;
2803         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2804         for (x = startx;x < endx;x++)
2805         {
2806                 z = zf[x];
2807                 c[0] = (data[0] + slope[0]*x) * z;
2808                 c[1] = (data[1] + slope[1]*x) * z;
2809                 c[2] = (data[2] + slope[2]*x) * z;
2810                 c[3] = (data[3] + slope[3]*x) * z;
2811                 out4f[x*4+0] = c[0];
2812                 out4f[x*4+1] = c[1];
2813                 out4f[x*4+2] = c[2];
2814                 out4f[x*4+3] = c[3];
2815         }
2816 }
2817
2818 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2819 {
2820         int x, startx = span->startx, endx = span->endx;
2821         float c[4], localcolor[4];
2822         localcolor[0] = subcolor[0];
2823         localcolor[1] = subcolor[1];
2824         localcolor[2] = subcolor[2];
2825         localcolor[3] = subcolor[3];
2826         for (x = startx;x < endx;x++)
2827         {
2828                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2829                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2830                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2831                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2832                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2833                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2834                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2835                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2836         }
2837 }
2838
2839 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2840 {
2841         int x, startx = span->startx, endx = span->endx;
2842         for (x = startx;x < endx;x++)
2843         {
2844                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2845                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2846                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2847                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2848         }
2849 }
2850
2851 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2852 {
2853         int x, startx = span->startx, endx = span->endx;
2854         for (x = startx;x < endx;x++)
2855         {
2856                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2857                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2858                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2859                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2860         }
2861 }
2862
2863 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2864 {
2865         int x, startx = span->startx, endx = span->endx;
2866         float a, b;
2867         for (x = startx;x < endx;x++)
2868         {
2869                 a = 1.0f - inb4f[x*4+3];
2870                 b = inb4f[x*4+3];
2871                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2872                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2873                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2874                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2875         }
2876 }
2877
2878 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2879 {
2880         int x, startx = span->startx, endx = span->endx;
2881         float localcolor[4], ilerp, lerp;
2882         localcolor[0] = color[0];
2883         localcolor[1] = color[1];
2884         localcolor[2] = color[2];
2885         localcolor[3] = color[3];
2886         ilerp = 1.0f - localcolor[3];
2887         lerp = localcolor[3];
2888         for (x = startx;x < endx;x++)
2889         {
2890                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2891                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2892                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2893                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2894         }
2895 }
2896
2897
2898
2899 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2900 {
2901 #ifdef SSE2_PRESENT
2902         int x;
2903         int startx = span->startx;
2904         int endx = span->endx;
2905         __m128 data, slope;
2906         __m128 mod, endmod;
2907         __m128i submod, substep, endsubmod;
2908         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2909         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2910         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2911         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2912         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2913         for (x = startx; x < endx;)
2914         {
2915                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2916                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2917                 if (nextsub >= endx)
2918                 {
2919                         nextsub = endsub = endx-1;
2920                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2921                 }
2922                 mod = endmod;
2923                 submod = endsubmod;
2924                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2925                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2926                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2927                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2928                 substep = _mm_packs_epi32(substep, substep);
2929                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2930                 {
2931                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2932                         pix = _mm_mulhi_epu16(pix, submod);
2933                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2934                 }
2935                 if (x <= endsub)
2936                 {
2937                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2938                         pix = _mm_mulhi_epu16(pix, submod);
2939                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2940                         x++;
2941                 }
2942         }
2943 #endif
2944 }
2945
// Writes one 4-byte pixel to out4ub[x*4] for every x in [span->startx, span->endx),
// holding the interpolated vertex attribute `arrayindex` converted to 8 bits per
// channel.  zf[x] is a per-pixel factor the linearly interpolated attribute is
// multiplied by (presumably 1/w for perspective correction - confirm against
// DPSOFTRAST_Draw_Span_Begin).  The exact value is only evaluated at sub-span
// boundaries; pixels in between are stepped linearly in 16-bit fixed point.
// Compiles to an empty function unless SSE2_PRESENT is defined.
2946 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2947 {
2948 #ifdef SSE2_PRESENT
2949         int x;
2950         int startx = span->startx;
2951         int endx = span->endx;
2952         __m128 data, slope;
2953         __m128 mod, endmod;
2954         __m128i submod, substep, endsubmod;
2955         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
             // swap lanes 0 and 2 (presumably RGBA -> BGRA, as the function name suggests)
2956         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2957         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
             // exact value at the first pixel, as float and as 4095-scaled fixed point
2958         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2959         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2960         for (x = startx; x < endx;)
2961         {
                     // one sub-span of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: pixels x..endsub
                     // are drawn, the exact end value is evaluated at pixel nextsub
2962                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2963                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2964                 if (nextsub >= endx)
2965                 {
                             // final sub-span: end exactly on the last pixel of the span
2966                         nextsub = endsub = endx-1;
2967                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2968                 }
                     // the previous end value becomes this sub-span's start value
2969                 mod = endmod;
2970                 submod = endsubmod;
2971                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
                     // per-pixel step in 4095-scaled fixed point
2972                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2973                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
                     // low 64 bits = value at pixel x, high 64 bits = value at pixel x+1
2974                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
                     // the loop below consumes two pixels per iteration, so both halves must
                     // advance by TWO per-pixel steps; adding the single step once per pair
                     // made the gradient advance at half rate inside every sub-span
2975                 substep = _mm_slli_epi16(_mm_packs_epi32(substep, substep), 1);
2976                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2977                 {
                             // drop the 4 fractional bits (4095 -> 255) and saturate to bytes
2978                         __m128i pix = _mm_srai_epi16(submod, 4);
2979                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2980                 }
                     // odd pixel left over in this sub-span: only the low half of submod is used
2981                 if (x <= endsub)
2982                 {
2983                         __m128i pix = _mm_srai_epi16(submod, 4);
2984                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2985                         x++;
2986                 }
2987         }
2988 #endif
2989 }
2990
// For every pixel x in [span->startx, span->endx):
//   out = saturate8(ina + max(inb - subcolor*255, 0))   per channel.
// subcolor is read as 4 floats (unaligned load) and has its lanes 0 and 2
// swapped before use (presumably RGBA -> BGRA).  `triangle` is not used.
// Compiles to an empty function unless SSE2_PRESENT is defined.
2991 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2992 {
2993 #ifdef SSE2_PRESENT
2994         int x, startx = span->startx, endx = span->endx;
             // subtract color scaled to 0..255, swizzled, then packed to 16 bits and
             // duplicated into both 64-bit halves so it applies to two pixels at once
2995         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2996         localcolor = _mm_packs_epi32(localcolor, localcolor);
             // two pixels per iteration
2997         for (x = startx;x+2 <= endx;x+=2)
2998         {
                     // widen bytes to 16-bit lanes (value in the low byte)
2999                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3000                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
                     // unsigned saturating subtract clamps (inb - localcolor) at 0
3001                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
                     // packus clamps the sum to 0..255
3002                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3003         }
             // remaining single pixel when the span length is odd
3004         if (x < endx)
3005         {
3006                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3007                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3008                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3009                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3010         }
3011 #endif
3012 }
3013
// For every pixel x in [span->startx, span->endx):
//   out = (ina * inb) >> 8   per channel (so 255*255 yields 254, not 255).
// `triangle` is not used.  Compiles to an empty function unless SSE2_PRESENT
// is defined.
3014 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3015 {
3016 #ifdef SSE2_PRESENT
3017         int x, startx = span->startx, endx = span->endx;
             // two pixels per iteration
3018         for (x = startx;x+2 <= endx;x+=2)
3019         {
                     // pix1: bytes widened into the low byte of each 16-bit lane (0..255)
3020                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
                     // pix2: bytes placed in the HIGH byte of each lane (value * 256)
3021                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
                     // high 16 bits of ina * (inb*256) == (ina*inb) >> 8
3022                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3023                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3024         }
             // remaining single pixel when the span length is odd
3025         if (x < endx)
3026         {
3027                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3028                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3029                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3030                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3031         }
3032 #endif
3033 }
3034
// For every pixel x in [span->startx, span->endx):
//   out = min(ina + inb, 255)   per channel.
// `triangle` is not used.  Compiles to an empty function unless SSE2_PRESENT
// is defined.
3035 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3036 {
3037 #ifdef SSE2_PRESENT
3038         int x, startx = span->startx, endx = span->endx;
             // two pixels per iteration
3039         for (x = startx;x+2 <= endx;x+=2)
3040         {
                     // widen both inputs to 16-bit lanes so the sum cannot wrap
3041                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3042                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3043                 pix1 = _mm_add_epi16(pix1, pix2);
                     // packus clamps the sum to 0..255
3044                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3045         }
             // remaining single pixel when the span length is odd
3046         if (x < endx)
3047         {
3048                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3049                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3050                 pix1 = _mm_add_epi16(pix1, pix2);
3051                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3052         }
3053 #endif
3054 }
3055
// For every pixel x in [span->startx, span->endx):
//   out = saturate8(ina + inb * tint)   per channel,
// where tint is the 4 floats at inbtintbgra converted to 8.8 fixed point.
// The tint is used in the lane order given (no swizzle is applied here).
// `triangle` is not used.  Compiles to an empty function unless SSE2_PRESENT
// is defined.
3056 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3057 {
3058 #ifdef SSE2_PRESENT
3059         int x, startx = span->startx, endx = span->endx;
             // tint * 256, packed to 16 bits and duplicated for two pixels
3060         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3061         tint = _mm_packs_epi32(tint, tint);
             // two pixels per iteration
3062         for (x = startx;x+2 <= endx;x+=2)
3063         {
                     // pix1: ina in the low byte of each lane; pix2: inb in the high byte (inb*256)
3064                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3065                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
                     // high 16 bits of (tint*256) * (inb*256) == tint * inb
3066                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3067                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3068         }
             // remaining single pixel when the span length is odd
3069         if (x < endx)
3070         {
3071                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3072                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3073                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3074                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3075         }
3076 #endif
3077 }
3078
// For every pixel x in [span->startx, span->endx):
//   out = ina + ((inb - ina) * inb.a) >> 8   per channel,
// i.e. blends from ina towards inb using the 4th byte of the inb pixel as
// the blend factor.  `triangle` is not used.  Compiles to an empty function
// unless SSE2_PRESENT is defined.
3079 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3080 {
3081 #ifdef SSE2_PRESENT
3082         int x, startx = span->startx, endx = span->endx;
             // two pixels per iteration
3083         for (x = startx;x+2 <= endx;x+=2)
3084         {
3085                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3086                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
                     // broadcast lane 3 (the 4th channel) of each pixel across that pixel's four lanes
3087                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
                     // signed multiply-high of (diff<<4) * (blend<<4) == (diff * blend) >> 8
3088                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3090         }
             // remaining single pixel when the span length is odd
3091         if (x < endx)
3092         {
3093                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3094                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
                     // only the low four lanes hold a pixel here, so only the low half is shuffled
3095                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3096                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3097                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3098         }
3099 #endif
3100 }
3101
// For every pixel x in [span->startx, span->endx):
//   out = in + ((color*255 - in) * (color[3]*255)) >> 8   per channel,
// i.e. blends the input towards a constant color using that color's 4th
// component as the blend factor.  color is read as 4 floats (unaligned load)
// and has lanes 0 and 2 swapped before use (presumably RGBA -> BGRA).
// `triangle` is not used.  Compiles to an empty function unless SSE2_PRESENT
// is defined.
3102 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3103 {
3104 #ifdef SSE2_PRESENT
3105         int x, startx = span->startx, endx = span->endx;
             // color scaled to 0..255 and swizzled, then packed to 16 bits for two pixels
3106         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3107         localcolor = _mm_packs_epi32(localcolor, localcolor);
             // blend factor: lane 3 of the color broadcast to all lanes, pre-shifted by 4
3108         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
             // two pixels per iteration
3109         for (x = startx;x+2 <= endx;x+=2)
3110         {
3111                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
                     // signed multiply-high of (diff<<4) * (blend<<4) == (diff * blend) >> 8
3112                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3113                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3114         }
             // remaining single pixel when the span length is odd
3115         if (x < endx)
3116         {
3117                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3118                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3119                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3120         }
3121 #endif
3122 }
3123
3124
3125
// Vertex stage of the "generic" shader: transforms/projects the position array
// with the ModelViewProjection matrix uniform and loads the color and first
// texcoord arrays; the second texcoord array is loaded only when the
// SPECULAR permutation bit is set (the pixel shader samples a second texture
// only in that case).
// NOTE(review): the two array arguments are presumably (destination, source) - confirm.
3126 void DPSOFTRAST_VertexShader_Generic(void)
3127 {
3128         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3129         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3130         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3131         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3132                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3133 }
3134
// Pixel stage of the "generic" shader for one span.
// - DIFFUSE set:   FragColor = texture(GL20TU_FIRST) * interpolated array 1;
//                  if SPECULAR is also set, a second texture (GL20TU_SECOND) is
//                  combined with it: multiplied (COLORMAPPING), added (GLOW) or
//                  alpha-blended (VERTEXTEXTUREBLEND); with none of those bits
//                  set the second texture is sampled but not used.
// - DIFFUSE clear: FragColor = interpolated array 1 only.
// The result is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): array indices 1 and 2 are presumably DPSOFTRAST_ARRAY_COLOR and
// DPSOFTRAST_ARRAY_TEXCOORD0 - confirm against the enum.
3135 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3136 {
3137         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3138         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3142         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3143         {
3144                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3145                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3146                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3147                 {
3148                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3149                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3150                         {
3151                                 // multiply
3152                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3153                         }
                             // this branch used to repeat the COLORMAPPING test above and was
                             // therefore unreachable; additive combining is selected by the GLOW bit
                             // NOTE(review): confirm GLOW is the bit the caller sets for additive mode
3154                         else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3155                         {
3156                                 // add
3157                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3158                         }
3159                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3160                         {
3161                                 // alphablend
3162                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3163                         }
3164                 }
3165         }
3166         else
3167                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3168         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3169 }
3170
3171
3172
// Vertex stage of the post-process shader: transforms/projects the position
// array with the ModelViewProjection matrix uniform and loads two texcoord
// arrays.  Note the second load pairs TEXCOORD1 with TEXCOORD4 rather than
// with itself.
// NOTE(review): the two array arguments are presumably (destination, source),
// i.e. input TEXCOORD4 feeds shader array TEXCOORD1 - confirm.
3173 void DPSOFTRAST_VertexShader_PostProcess(void)
3174 {
3175         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3176         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3177         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3178 }
3179
// Pixel stage of the post-process shader for one span:
//   1. sample GL20TU_FIRST (array index 2) into FragColor;
//   2. if BLOOM is set, sample GL20TU_SECOND (array index 3) and add it to
//      FragColor after subtracting the BloomColorSubtract uniform;
//   3. blend FragColor towards the ViewTintColor uniform;
//   4. hand the result to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The SATURATION and GAMMARAMPS permutations are recognised but not
// implemented (empty branches below).
3180 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3181 {
3182         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3183         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3184         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3186         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3187         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3188         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3189         {
3190                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3191                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3192         }
             // applied unconditionally; the tint's 4th component is the blend factor
3193         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3194         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3195         {
3196                 // TODO: implement saturation
3197         }
3198         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3199         {
3200                 // TODO: implement gammaramps
3201         }
3202         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3203 }
3204
3205
3206
// Vertex stage of the depth/shadow shader: only the position array is
// transformed/projected with the ModelViewProjection matrix uniform; no other
// vertex arrays are processed.
3207 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3208 {
3209         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3210 }
3211
// Pixel stage of the depth/shadow shader: outputs all-zero color bytes for the
// pixels [span->startx, span->endx) and passes them to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
3212 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3213 {
3214         // this is never called (because colormask is off when this shader is used)
3215         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3216         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3217         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // only the part of the buffer covered by the span is cleared (4 bytes per pixel)
3218         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3219         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3220 }
3221
3222
3223
// Vertex stage of the flat-color shader: transforms/projects the position
// array with the ModelViewProjection matrix uniform and transforms the first
// texcoord array with the TexMatrix uniform.
3224 void DPSOFTRAST_VertexShader_FlatColor(void)
3225 {
3226         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3227         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3228 }
3229
// Pixel stage of the flat-color shader for one span:
//   out.bgr = texture(GL20TU_COLOR).bgr * Color_Ambient,  out.a = texture.a * Alpha
// computed in 16-bit fixed point for pixels whose span->pixelmask byte is
// non-zero.  When alpha test is off and the blend mode is opaque the result is
// written straight into color buffer 0; otherwise it goes to a temporary
// buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Compiles to an empty function unless SSE2_PRESENT is defined.
3230 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3231 {
3232 #ifdef SSE2_PRESENT
3233         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: framebuffer pixel at (span->x, span->y); indexed below by x*4
3234         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3235         int x, startx = span->startx, endx = span->endx;
3236         __m128i Color_Ambientm;
3237         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3238         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3240         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3241         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test or blending needed: render to the temporary buffer instead
3242         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3243                 pixel = buffer_FragColorbgra8;
             // Color_Ambient * 256 with lanes 0 and 2 swapped (presumably RGBA -> BGRA);
             // lane 3 is then replaced by the Alpha uniform * 255, and the whole vector
             // is packed to 16 bits and duplicated so it covers two pixels
3244         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3245         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3246         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3247         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3248         for (x = startx;x < endx;x++)
3249         {
3250                 __m128i color, pix;
                     // fast path: four consecutive pixels whose mask bytes are all exactly 1
3251                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3252                 {
3253                         __m128i pix2;
3254                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // texture bytes go into the high byte of each lane (value*256), so the
                             // multiply-high yields (Color_Ambientm * texel) >> 8
3255                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3256                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3257                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // +3 here plus the loop's x++ skips the four pixels just written
3258                         x += 3;
3259                         continue;
3260                 }
                     // slow path: one pixel at a time, skipping masked-out pixels
3261                 if (!pixelmask[x])
3262                         continue;
3263                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3264                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3265                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3266         }
             // the temporary buffer still has to go through the generic finish step
3267         if (pixel == buffer_FragColorbgra8)
3268                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3269 #endif
3270 }
3271
3272
3273
// Vertex stage of the vertex-color shader: transforms/projects the position
// array with the ModelViewProjection matrix uniform, loads the color array and
// transforms the first texcoord array with the TexMatrix uniform.
3274 void DPSOFTRAST_VertexShader_VertexColor(void)
3275 {
3276         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3277         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3278         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3279 }
3280
// Pixel stage of the vertex-color shader for one span:
//   out = texture(GL20TU_COLOR) * (Color_Ambient + Color_Diffuse * vertexcolor)
// where vertexcolor is the DPSOFTRAST_ARRAY_COLOR attribute interpolated
// along the span and multiplied by buffer_z[x].  The alpha lane of the
// diffuse term is zeroed and the alpha lane of the ambient term is the Alpha
// uniform * 255, so out.a = texture.a * Alpha.
// Only pixels with a non-zero span->pixelmask byte are written.  When alpha
// test is off and the blend mode is opaque the result goes straight into color
// buffer 0; otherwise into a temporary buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// Compiles to an empty function unless SSE2_PRESENT is defined.
3281 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3282 {
3283 #ifdef SSE2_PRESENT
3284         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: framebuffer pixel at (span->x, span->y); indexed below by x*4
3285         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3286         int x, startx = span->startx, endx = span->endx;
3287         __m128i Color_Ambientm, Color_Diffusem;
3288         __m128 data, slope;
3289         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3290         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3292         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3293         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3294         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test or blending needed: render to the temporary buffer instead
3295         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3296                 pixel = buffer_FragColorbgra8;
             // ambient color * 256 (lanes 0/2 swapped), alpha lane replaced by Alpha * 255
3297         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3298         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3299         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3300         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse color * 4096 (lanes 0/2 swapped), alpha lane cleared
3301         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3302         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3303         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
             // vertex color attribute: value at startx and per-pixel slope, both swizzled
             // like the uniforms and pre-scaled by 4096
3304         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3305         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3306         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3307         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3308         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3309         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // data is advanced by one slope per pixel, including skipped (masked) pixels
3310         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3311         {
3312                 __m128i color, mod, pix;
                     // fast path: four consecutive pixels whose mask bytes are all exactly 1
3313                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3314                 {
3315                         __m128i pix2, mod2;
3316                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3317                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // mod = pixels x and x+1, mod2 = pixels x+2 and x+3; data is stepped
                             // three times here and a fourth time by the loop increment
3318                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3319                         data = _mm_add_ps(data, slope);
3320                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3321                         data = _mm_add_ps(data, slope);
3322                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3323                         data = _mm_add_ps(data, slope);
3324                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                             // (diffuse*4096 * mod*4096) >> 16 gives the diffuse term at scale 256,
                             // matching the ambient term; the sum is then multiplied by texel*256
3325                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3326                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3327                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3328                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3329                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // +3 here plus the loop's x++ skips the four pixels just written
3330                         x += 3;
3331                         continue;
3332                 }
                     // slow path: one pixel at a time, skipping masked-out pixels
3333                 if (!pixelmask[x])
3334                         continue;
3335                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3336                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3337                 mod = _mm_packs_epi32(mod, mod);
3338                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3339                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3340         }
             // the temporary buffer still has to go through the generic finish step
3341         if (pixel == buffer_FragColorbgra8)
3342                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3343 #endif
3344 }
3345
3346
3347
// Vertex stage of the lightmap shader: transforms/projects the position array
// with the ModelViewProjection matrix uniform, transforms the first texcoord
// array with the TexMatrix uniform and loads the TEXCOORD4 array unchanged
// (the pixel shader samples the lightmap texture with TEXCOORD4).
3348 void DPSOFTRAST_VertexShader_Lightmap(void)
3349 {
3350         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3351         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3352         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3353 }
3354
3355 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3356 {
3357 #ifdef SSE2_PRESENT
3358         unsigned char * RESTRICT pixelmask = span->pixelmask;
3359         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3360         int x, startx = span->startx, endx = span->endx;
3361         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3362         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3363         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3367         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3368         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3369         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3370         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3371                 pixel = buffer_FragColorbgra8;
3372         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3373         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3375         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3376         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3377         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3378         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3379         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3380         {
3381                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3382                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3383                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3384                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3385                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3386                 for (x = startx;x < endx;x++)
3387                 {
3388                         __m128i color, lightmap, glow, pix;
3389                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3390                         {
3391                                 __m128i pix2;
3392                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3393                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3394                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3395                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3396                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3397                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3398                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3399                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3400                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3401                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3402                                 x += 3;
3403                                 continue;
3404                         }
3405                         if (!pixelmask[x])
3406                                 continue;
3407                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3408                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3409                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3410                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3411                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3412                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3413                 }
3414         }
3415         else
3416         {
3417                 for (x = startx;x < endx;x++)
3418                 {
3419                         __m128i color, lightmap, pix;
3420                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3421                         {
3422                                 __m128i pix2;
3423                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3424                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3425                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3426                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3427                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3428                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3429                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3430                                 x += 3;
3431                                 continue;
3432                         }
3433                         if (!pixelmask[x]) 
3434                                 continue;
3435                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3436                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3437                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3438                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3439                 }
3440         }
3441         if (pixel == buffer_FragColorbgra8)
3442                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3443 #endif
3444 }
3445
3446
3447 void DPSOFTRAST_VertexShader_LightDirection(void);
3448 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3449
// Vertex stage of the FakeLight shader mode.
// It has no vertex work of its own: everything is delegated to the
// LightDirection vertex stage.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
        // reuse the LightDirection vertex setup unchanged
        DPSOFTRAST_VertexShader_LightDirection();
}
3454
3455 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3456 {
3457         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3458 }
3459
3460
3461
3462 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3463 {
3464         DPSOFTRAST_VertexShader_LightDirection();
3465         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3466 }
3467
3468 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3469 {
3470         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3471 }
3472
3473
3474
3475 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3476 {
3477         DPSOFTRAST_VertexShader_LightDirection();
3478         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3479 }
3480
3481 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3482 {
3483         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3484 }
3485
3486
3487
3488 void DPSOFTRAST_VertexShader_LightDirection(void)
3489 {
3490         int i;
3491         int numvertices = dpsoftrast.numvertices;
3492         float LightDir[4];
3493         float LightVector[4];
3494         float EyePosition[4];
3495         float EyeVectorModelSpace[4];
3496         float EyeVector[4];
3497         float position[4];
3498         float svector[4];
3499         float tvector[4];
3500         float normal[4];
3501         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3502         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3503         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3504         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3505         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3506         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3507         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3508         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3509         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3510         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3511         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3512         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3513         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3514         for (i = 0;i < numvertices;i++)
3515         {
3516                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3517                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3518                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3519                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3520                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3521                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3522                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3523                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3524                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3525                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3526                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3527                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3528                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3529                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3530                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3531                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3532                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3533                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3534                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3535                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3536                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3537                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3538                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3539                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3540                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3541                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3542                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3543                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3544                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3545         }
3546         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3547 }
3548
// Small scalar / 3-component vector helpers used by the LightDirection pixel shader.
// These are function-like macros: every argument may be evaluated more than
// once, so never pass an expression with side effects (i++, a function call, ...).

// smaller / larger of two values
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of a and b
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length and length of the first three elements of v
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales the first three elements of v to unit length, in place.
// A zero-length vector is left untouched (no division by zero).
// v is parenthesized at every use so that pointer expressions such as
// (array + offset) expand correctly, and the scratch variable has a long
// prefixed name so it cannot collide with an identifier used inside v.
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
        float dpsoftrast_normalize_scale_ = (float)sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
        if (dpsoftrast_normalize_scale_)\
        {\
                dpsoftrast_normalize_scale_ = 1.0f / dpsoftrast_normalize_scale_;\
                (v)[0] *= dpsoftrast_normalize_scale_;\
                (v)[1] *= dpsoftrast_normalize_scale_;\
                (v)[2] *= dpsoftrast_normalize_scale_;\
        }\
}\
while(0)
3567
3568 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3569 {
3570         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3571         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3572         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3573         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3574         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3575         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3576         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580         int x, startx = span->startx, endx = span->endx;
3581         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3582         float LightVectordata[4];
3583         float LightVectorslope[4];
3584         float EyeVectordata[4];
3585         float EyeVectorslope[4];
3586         float VectorSdata[4];
3587         float VectorSslope[4];
3588         float VectorTdata[4];
3589         float VectorTslope[4];
3590         float VectorRdata[4];
3591         float VectorRslope[4];
3592         float z;
3593         float diffusetex[4];
3594         float glosstex[4];
3595         float surfacenormal[4];
3596         float lightnormal[4];
3597         float lightnormal_modelspace[4];
3598         float eyenormal[4];
3599         float specularnormal[4];
3600         float diffuse;
3601         float specular;
3602         float SpecularPower;
3603         int d[4];
3604         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3605         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3606         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3607         Color_Glow[3] = 0.0f;
3608         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3609         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3610         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3611         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3612         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3613         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3614         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3615         Color_Pants[3] = 0.0f;
3616         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3617         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3618         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3619         Color_Shirt[3] = 0.0f;
3620         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3621         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3622         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3623         {
3624                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3625                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3626         }
3627         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3628         {
3629                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3630         }
3631         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3632         {
3633                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3634                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3635                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3636                 Color_Diffuse[3] = 0.0f;
3637                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3638                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3639                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3640                 LightColor[3] = 0.0f;
3641                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3642                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3643                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3644                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3645                 Color_Specular[3] = 0.0f;
3646                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3647                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3648                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3649
3650                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3651                 {
3652                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3653                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3654                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3655                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3656                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3657                 }
3658                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3659                 {
3660                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3661                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3662                 }
3663                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3664                 {
3665                         // nothing of this needed
3666                 }
3667                 else
3668                 {
3669                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3670                 }
3671
3672                 for (x = startx;x < endx;x++)
3673                 {
3674                         z = buffer_z[x];
3675                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3676                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3677                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3678                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3679                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3680                         {
3681                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3682                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3683                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3684                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3685                         }
3686                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3687                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3688                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3689                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3690                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3691                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3692                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3693                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3694
3695                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3696                         {
3697                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3698                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3699                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3700                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3701
3702                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3703                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3704                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3705                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3706
3707                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3708                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3709                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3710                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3711
3712                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3713                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3714                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3715                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3716
3717                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3718                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3719
3720                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3721                                 {
3722                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3723                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3724                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3725                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3726                                 }
3727                         }
3728                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3729                         {
3730                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3731                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3732                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3733                                 {
3734                                         float f = 1.0f / 256.0f;
3735                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3736                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3737                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3738                                 }
3739                         }
3740                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3741                         {
3742                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3743                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3744                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3745                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3746
3747                                 LightColor[0] = 1.0;
3748                                 LightColor[1] = 1.0;
3749                                 LightColor[2] = 1.0;
3750                         }
3751                         else
3752                         {
3753                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3754                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3755                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3756                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3757                         }
3758
3759                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3760                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3761                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3762                         DPSOFTRAST_Vector3Normalize(eyenormal);
3763
3764                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3765                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3766                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3767                         DPSOFTRAST_Vector3Normalize(specularnormal);
3768
3769                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3770                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3771                         specular = pow(specular, SpecularPower * glosstex[3]);
3772                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3773                         {
3774                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3775                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3776                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3777                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3778                         }
3779                         else
3780                         {
3781                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3782                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3783                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3784                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3785                         }
3786
3787                         buffer_FragColorbgra8[x*4+0] = d[0];
3788                         buffer_FragColorbgra8[x*4+1] = d[1];
3789                         buffer_FragColorbgra8[x*4+2] = d[2];
3790                         buffer_FragColorbgra8[x*4+3] = d[3];
3791                 }
3792         }
3793         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3794         {
3795                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3796                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3797                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3798                 Color_Diffuse[3] = 0.0f;
3799                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3800                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3801                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3802                 LightColor[3] = 0.0f;
3803                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3804
3805                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3806                 {
3807                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3808                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3809                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3810                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3811                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3812                 }
3813                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3814                 {
3815                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3816                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3817                 }
3818                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3819                 {
3820                         // nothing of this needed
3821                 }
3822                 else
3823                 {
3824                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3825                 }
3826
3827                 for (x = startx;x < endx;x++)
3828                 {
3829                         z = buffer_z[x];
3830                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3831                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3832                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3833                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3834                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3835                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3836                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3837                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3838
3839                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3840                         {
3841                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3842                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3843                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3844                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3845
3846                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3847                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3848                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3849                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3850
3851                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3852                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3853                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3854                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3855
3856                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3857                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3858                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3859                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3860
3861                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3862                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3863
3864                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3865                                 {
3866                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3867                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3868                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3869                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3870                                 }
3871                         }
3872                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3873                         {
3874                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3875                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3876                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3877                                 {
3878                                         float f = 1.0f / 256.0f;
3879                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3880                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3881                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3882                                 }
3883                         }
3884                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3885                         {
3886                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3887                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3888                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3889                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3890
3891                                 LightColor[0] = 1.0;
3892                                 LightColor[1] = 1.0;
3893                                 LightColor[2] = 1.0;
3894                         }
3895                         else
3896                         {
3897                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3898                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3899                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3900                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3901                         }
3902
3903                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3904                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3905                         {
3906                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3907                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3908                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3909                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3910                         }
3911                         else
3912                         {
3913                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3914                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3915                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3916                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3917                         }
3918                         buffer_FragColorbgra8[x*4+0] = d[0];
3919                         buffer_FragColorbgra8[x*4+1] = d[1];
3920                         buffer_FragColorbgra8[x*4+2] = d[2];
3921                         buffer_FragColorbgra8[x*4+3] = d[3];
3922                 }
3923         }
3924         else
3925         {
3926                 for (x = startx;x < endx;x++)
3927                 {
3928                         z = buffer_z[x];
3929                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3930                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3931                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3932                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3933
3934                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3935                         {
3936                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3937                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3938                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3939                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3940                         }
3941                         else
3942                         {
3943                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3944                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3945                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3946                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3947                         }
3948                         buffer_FragColorbgra8[x*4+0] = d[0];
3949                         buffer_FragColorbgra8[x*4+1] = d[1];
3950                         buffer_FragColorbgra8[x*4+2] = d[2];
3951                         buffer_FragColorbgra8[x*4+3] = d[3];
3952                 }
3953         }
3954         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3955 }
3956
3957
3958
3959 void DPSOFTRAST_VertexShader_LightSource(void)
3960 {
3961         int i;
3962         int numvertices = dpsoftrast.numvertices;
3963         float LightPosition[4];
3964         float LightVector[4];
3965         float LightVectorModelSpace[4];
3966         float EyePosition[4];
3967         float EyeVectorModelSpace[4];
3968         float EyeVector[4];
3969         float position[4];
3970         float svector[4];
3971         float tvector[4];
3972         float normal[4];
3973         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3974         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3975         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3976         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3977         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3978         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3979         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3980         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3981         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3982         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3983         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3984         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3985         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3986         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3987         for (i = 0;i < numvertices;i++)
3988         {
3989                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3990                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3991                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3992                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3993                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3994                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3995                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3996                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3997                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3998                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3999                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4000                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4001                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4002                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4003                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4004                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4005                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4006                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4007                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4008                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4009                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4010                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4011                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4012                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4013                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4014                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4015                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4016                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4017                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4018                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4019                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4020                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4021         }
4022         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4023         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4024 }
4025
4026 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4027 {
4028 #ifdef SSE2_PRESENT
4029         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4030         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4031         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4032         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4033         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4034         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4035         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4036         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4037         int x, startx = span->startx, endx = span->endx;
4038         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4039         float CubeVectordata[4];
4040         float CubeVectorslope[4];
4041         float LightVectordata[4];
4042         float LightVectorslope[4];
4043         float EyeVectordata[4];
4044         float EyeVectorslope[4];
4045         float z;
4046         float diffusetex[4];
4047         float glosstex[4];
4048         float surfacenormal[4];
4049         float lightnormal[4];
4050         float eyenormal[4];
4051         float specularnormal[4];
4052         float diffuse;
4053         float specular;
4054         float SpecularPower;
4055         float CubeVector[4];
4056         float attenuation;
4057         int d[4];
4058         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4059         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4060         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4061         Color_Glow[3] = 0.0f;
4062         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4063         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4064         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4065         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4066         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4067         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4068         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4069         Color_Diffuse[3] = 0.0f;
4070         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4071         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4072         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4073         Color_Specular[3] = 0.0f;
4074         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4075         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4076         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4077         Color_Pants[3] = 0.0f;
4078         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4079         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4080         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4081         Color_Shirt[3] = 0.0f;
4082         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4083         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4084         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4085         LightColor[3] = 0.0f;
4086         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4087         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4088         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4089         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4090         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4091         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4092         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4093         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4094         {
4095                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4096                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4097         }
4098         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4099                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4100         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4101         {
4102                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4103                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4104                 for (x = startx;x < endx;x++)
4105                 {
4106                         z = buffer_z[x];
4107                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4108                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4109                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4110                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4111                         if (attenuation < 0.01f)
4112                                 continue;
4113                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4114                         {
4115                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4116                                 if (attenuation < 0.01f)
4117                                         continue;
4118                         }
4119
4120                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4121                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4122                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4123                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4124                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4125                         {
4126                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4127                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4128                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4129                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4130                         }
4131                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4132                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4133                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4134                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4135                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4136                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4137                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4138                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4139
4140                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4141                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4142                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4143                         DPSOFTRAST_Vector3Normalize(lightnormal);
4144
4145                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4146                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4147                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4148                         DPSOFTRAST_Vector3Normalize(eyenormal);
4149
4150                         specularnormal[0] = lightnormal[0] + eyenormal[0];
4151                         specularnormal[1] = lightnormal[1] + eyenormal[1];
4152                         specularnormal[2] = lightnormal[2] + eyenormal[2];
4153                         DPSOFTRAST_Vector3Normalize(specularnormal);
4154
4155                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4156                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4157                         specular = pow(specular, SpecularPower * glosstex[3]);
4158                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4159                         {
4160                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4161                                 attenuation *= (1.0f / 255.0f);
4162                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4163                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4164                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4165                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4166                         }
4167                         else
4168                         {
4169                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4170                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4171                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4172                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4173                         }
4174                         buffer_FragColorbgra8[x*4+0] = d[0];
4175                         buffer_FragColorbgra8[x*4+1] = d[1];
4176                         buffer_FragColorbgra8[x*4+2] = d[2];
4177                         buffer_FragColorbgra8[x*4+3] = d[3];
4178                 }
4179         }
4180         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4181         {
4182                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4183                 for (x = startx;x < endx;x++)
4184                 {
4185                         z = buffer_z[x];
4186                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4187                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4188                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4189                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4190                         if (attenuation < 0.01f)
4191                                 continue;
4192                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4193                         {
4194                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4195                                 if (attenuation < 0.01f)
4196                                         continue;
4197                         }
4198
4199                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4200                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4201                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4202                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4203                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4204                         {
4205                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4206                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4207                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4208                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4209                         }
4210                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4211                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4212                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4213                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4214
4215                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4216                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4217                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4218                         DPSOFTRAST_Vector3Normalize(lightnormal);
4219
4220                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4221                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4222                         {
4223                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4224                                 attenuation *= (1.0f / 255.0f);
4225                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4226                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4227                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4228                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4229                         }
4230                         else
4231                         {
4232                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4233                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4234                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4235                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4236                         }
4237                         buffer_FragColorbgra8[x*4+0] = d[0];
4238                         buffer_FragColorbgra8[x*4+1] = d[1];
4239                         buffer_FragColorbgra8[x*4+2] = d[2];
4240                         buffer_FragColorbgra8[x*4+3] = d[3];
4241                 }
4242         }
4243         else
4244         {
4245                 for (x = startx;x < endx;x++)
4246                 {
4247                         z = buffer_z[x];
4248                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4249                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4250                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4251                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4252                         if (attenuation < 0.01f)
4253                                 continue;
4254                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4255                         {
4256                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4257                                 if (attenuation < 0.01f)
4258                                         continue;
4259                         }
4260
4261                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4262                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4263                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4264                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4265                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4266                         {
4267                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4268                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4269                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4270                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4271                         }
4272                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4273                         {
4274                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4275                                 attenuation *= (1.0f / 255.0f);
4276                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4277                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4278                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4279                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4280                         }
4281                         else
4282                         {
4283                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4284                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4285                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4286                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4287                         }
4288                         buffer_FragColorbgra8[x*4+0] = d[0];
4289                         buffer_FragColorbgra8[x*4+1] = d[1];
4290                         buffer_FragColorbgra8[x*4+2] = d[2];
4291                         buffer_FragColorbgra8[x*4+3] = d[3];
4292                 }
4293         }
4294         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4295 #endif
4296 }
4297
4298
4299
4300 void DPSOFTRAST_VertexShader_Refraction(void)
4301 {
4302         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4303 }
4304
4305 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4306 {
4307         // TODO: IMPLEMENT
4308         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4309         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4310         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4311         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4312         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4313 }
4314
4315
4316
4317 void DPSOFTRAST_VertexShader_Water(void)
4318 {
4319         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4320 }
4321
4322
4323 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4324 {
4325         // TODO: IMPLEMENT
4326         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4327         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4328         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4329         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4330         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4331 }
4332
4333
4334
4335 void DPSOFTRAST_VertexShader_ShowDepth(void)
4336 {
4337         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4338 }
4339
4340 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4341 {
4342         // TODO: IMPLEMENT
4343         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4344         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4345         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4346         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4347         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4348 }
4349
4350
4351
4352 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4353 {
4354         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4355 }
4356
4357 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4358 {
4359         // TODO: IMPLEMENT
4360         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4361         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4362         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4363         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4364         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4365 }
4366
4367
4368
4369 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4370 {
4371         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4372 }
4373
4374 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4375 {
4376         // TODO: IMPLEMENT
4377         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4378         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4379         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4380         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4381         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4382 }
4383
4384
4385
4386 typedef struct DPSOFTRAST_ShaderModeInfo_s
4387 {
4388         int lodarrayindex;
4389         void (*Vertex)(void);
4390         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4391         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4392         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4393 }
4394 DPSOFTRAST_ShaderModeInfo;
4395
4396 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4397 {
4398         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4399         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4400         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4401         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4402         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4403         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4404         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4405         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4406         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4407         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4408         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4409         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4410         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4411         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4412         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4413         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4414 };
4415
4416 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4417 {
4418         int i;
4419         int x;
4420         int startx;
4421         int endx;
4422 //      unsigned int c;
4423 //      unsigned int *colorpixel;
4424         unsigned int *depthpixel;
4425         float w;
4426         float wslope;
4427         int depth;
4428         int depthslope;
4429         unsigned int d;
4430         DPSOFTRAST_State_Triangle *triangle;
4431         DPSOFTRAST_State_Span *span;
4432         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4433         for (i = 0; i < thread->numspans; i++)
4434         {
4435                 span = &thread->spans[i];
4436                 triangle = &thread->triangles[span->triangle];
4437                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4438                 {
4439                         wslope = triangle->w[0];
4440                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4441                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4442                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4443                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4444                         startx = span->startx;
4445                         endx = span->endx;
4446                         switch(thread->fb_depthfunc)
4447                         {
4448                         default:
4449                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4450                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4451                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4452                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4453                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4454                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4455                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4456                         }
4457                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4458                         //for (x = startx;x < endx;x++)
4459                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4460                         // if there is no color buffer, skip pixel shader
4461                         while (startx < endx && !pixelmask[startx])
4462                                 startx++;
4463                         while (endx > startx && !pixelmask[endx-1])
4464                                 endx--;
4465                         if (startx >= endx)
4466                                 continue; // no pixels to fill
4467                         span->pixelmask = pixelmask;
4468                         span->startx = startx;
4469                         span->endx = endx;
4470                         // run pixel shader if appropriate
4471                         // do this before running depthmask code, to allow the pixelshader
4472                         // to clear pixelmask values for alpha testing
4473                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4474                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4475                         if (thread->depthmask)
4476                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4477                                         if (pixelmask[x])
4478                                                 depthpixel[x] = d;
4479                 }
4480                 else
4481                 {
4482                         // no depth testing means we're just dealing with color...
4483                         // if there is no color buffer, skip pixel shader
4484                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4485                         {
4486                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4487                                 span->pixelmask = pixelmask;
4488                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4489                         }
4490                 }
4491         }
4492         thread->numspans = 0;
4493 }
4494
4495 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4496
4497 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4498 {
4499 #ifdef SSE2_PRESENT
4500         int cullface = thread->cullface;
4501         int minx, maxx, miny, maxy;
4502         int miny1, maxy1, miny2, maxy2;
4503         __m128i fbmin, fbmax;
4504         __m128 viewportcenter, viewportscale;
4505         int firstvertex = command->firstvertex;
4506         int numvertices = command->numvertices;
4507         int numtriangles = command->numtriangles;
4508         const int *element3i = command->element3i;
4509         const unsigned short *element3s = command->element3s;
4510         int clipped = command->clipped;
4511         int i;
4512         int j;
4513         int k;
4514         int y;
4515         int e[3];
4516         __m128i screeny;
4517         int starty, endy, bandy;
4518         int numpoints;
4519         int clipcase;
4520         float clipdist[4];
4521         __m128 triangleedge1, triangleedge2, trianglenormal;
4522         __m128 clipfrac[3];
4523         __m128 screen[4];
4524         DPSOFTRAST_State_Triangle *triangle;
4525         DPSOFTRAST_Texture *texture;
4526         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4527         miny = thread->fb_scissor[1];
4528         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4529         miny1 = bound(miny, thread->miny1, maxy);
4530         maxy1 = bound(miny, thread->maxy1, maxy);
4531         miny2 = bound(miny, thread->miny2, maxy);
4532         maxy2 = bound(miny, thread->maxy2, maxy);
4533         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4534         {
4535                 if (!ATOMIC_DECREMENT(command->refcount))
4536                 {
4537                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4538                                 MM_FREE(command->arrays);
4539                 }
4540                 return;
4541         }
4542         minx = thread->fb_scissor[0];
4543         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4544         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4545         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4546         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4547         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4548         screen[3] = _mm_setzero_ps();
4549         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4550         for (i = 0;i < numtriangles;i++)
4551         {
4552                 const float *screencoord4f = command->arrays;
4553                 const float *arrays = screencoord4f + numvertices*4;
4554
4555                 // generate the 3 edges of this triangle
4556                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4557                 if (element3s)
4558                 {
4559                         e[0] = element3s[i*3+0] - firstvertex;
4560                         e[1] = element3s[i*3+1] - firstvertex;
4561                         e[2] = element3s[i*3+2] - firstvertex;
4562                 }
4563                 else if (element3i)
4564                 {
4565                         e[0] = element3i[i*3+0] - firstvertex;
4566                         e[1] = element3i[i*3+1] - firstvertex;
4567                         e[2] = element3i[i*3+2] - firstvertex;
4568                 }
4569                 else
4570                 {
4571                         e[0] = i*3+0;
4572                         e[1] = i*3+1;
4573                         e[2] = i*3+2;
4574                 }
4575
4576 #define SKIPBACKFACE \
4577                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4578                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4579                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4580                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4581                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4582                 switch(cullface) \
4583                 { \
4584                 case GL_BACK: \
4585                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4586                                 continue; \
4587                         break; \
4588                 case GL_FRONT: \
4589                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4590                                 continue; \
4591                         break; \
4592                 }
4593
4594 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4595                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4596                         { \
4597                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4598                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4599                         }
4600 #define CLIPPEDVERTEXCOPY(k,p1) \
4601                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4602
4603 #define GENATTRIBCOPY(attrib, p1) \
4604                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4605 #define GENATTRIBLERP(attrib, p1, p2) \
4606                 { \
4607                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4608                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4609                 }
4610 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4611                 switch(clipcase) \
4612                 { \
4613                 default: \
4614                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4615                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4616                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4617                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4618                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4619                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4620                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4621                 }
4622
4623                 if (! clipped)
4624                         goto notclipped;
4625
4626                 // calculate distance from nearplane
4627                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4628                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4629                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4630                 if (clipdist[0] >= 0.0f)
4631                 {
4632                         if (clipdist[1] >= 0.0f)
4633                         {
4634                                 if (clipdist[2] >= 0.0f)
4635                                 {
4636                                 notclipped:
4637                                         // triangle is entirely in front of nearplane
4638                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4639                                         SKIPBACKFACE;
4640                                         numpoints = 3;
4641                                         clipcase = 0;
4642                                 }
4643                                 else
4644                                 {
4645                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4646                                         SKIPBACKFACE;
4647                                         numpoints = 4;
4648                                         clipcase = 1;
4649                                 }
4650                         }
4651                         else
4652                         {
4653                                 if (clipdist[2] >= 0.0f)
4654                                 {
4655                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4656                                         SKIPBACKFACE;
4657                                         numpoints = 4;
4658                                         clipcase = 2;
4659                                 }
4660                                 else
4661                                 {
4662                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4663                                         SKIPBACKFACE;
4664                                         numpoints = 3;
4665                                         clipcase = 3;
4666                                 }
4667                         }
4668                 }
4669                 else if (clipdist[1] >= 0.0f)
4670                 {
4671                         if (clipdist[2] >= 0.0f)
4672                         {
4673                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4674                                 SKIPBACKFACE;
4675                                 numpoints = 4;
4676                                 clipcase = 4;
4677                         }
4678                         else
4679                         {
4680                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4681                                 SKIPBACKFACE;
4682                                 numpoints = 3;
4683                                 clipcase = 5;
4684                         }
4685                 }
4686                 else if (clipdist[2] >= 0.0f)
4687                 {
4688                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4689                         SKIPBACKFACE;
4690                         numpoints = 3;
4691                         clipcase = 6;
4692                 }
4693                 else continue; // triangle is entirely behind nearplane
4694
4695                 {
4696                         // calculate integer y coords for triangle points
4697                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4698                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4699                                         screenmin = _mm_min_epi16(screeni, screenir),
4700                                         screenmax = _mm_max_epi16(screeni, screenir);
4701                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4702                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4703                         screenmin = _mm_max_epi16(screenmin, fbmin);
4704                         screenmax = _mm_min_epi16(screenmax, fbmax);
4705                         // skip offscreen triangles
4706                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4707                                 continue;
4708                         starty = _mm_extract_epi16(screenmin, 1);
4709                         endy = _mm_extract_epi16(screenmax, 1)+1;
4710                         if (starty >= maxy1 && endy <= miny2)
4711                                 continue;
4712                         screeny = _mm_srai_epi32(screeni, 16);
4713                 }
4714
4715                 triangle = &thread->triangles[thread->numtriangles];
4716
4717                 // calculate attribute plans for triangle data...
4718                 // okay, this triangle is going to produce spans, we'd better project
4719                 // the interpolants now (this is what gives perspective texturing),
4720                 // this consists of simply multiplying all arrays by the W coord
4721                 // (which is basically 1/Z), which will be undone per-pixel
4722                 // (multiplying by Z again) to get the perspective-correct array
4723                 // values
4724                 {
4725                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4726                         __m128 mipedgescale, mipdensity;
4727                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4728                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4729                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4730                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4731                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4732                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4733                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4734                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4735                         attribedge1 = _mm_sub_ss(w0, w1);
4736                         attribedge2 = _mm_sub_ss(w2, w1);
4737                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4738                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4739                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4740                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4741                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4742                         _mm_store_ss(&triangle->w[0], attribxslope);
4743                         _mm_store_ss(&triangle->w[1], attribyslope);
4744                         _mm_store_ss(&triangle->w[2], attriborigin);
4745                         mipedgescale = _mm_setzero_ps();
4746                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4747                         {
4748                                 __m128 attrib0, attrib1, attrib2;
4749                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4750                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4751                                         break;
4752                                 arrays += numvertices*4;
4753                                 GENATTRIBS(attrib0, attrib1, attrib2);
4754                                 attriborigin = _mm_mul_ps(attrib1, w1);
4755                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4756                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4757                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4758                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4759                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4760                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4761                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4762                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4763                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4764                                 {
4765                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4766                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4767                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4768                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4769                                 }
4770                         }
4771
4772                         memset(triangle->mip, 0, sizeof(triangle->mip));
4773                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4774                         {
4775                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4776                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4777                                         break;
4778                                 texture = thread->texbound[texunit];
4779                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4780                                 {
4781                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4782                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4783                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4784                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4785                                         // this will be multiplied in the texturing routine by the texture resolution
4786                                         y = _mm_cvtss_si32(mipdensity);
4787                                         if (y > 0)
4788                                         {
4789                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4790                                                 if (y > texture->mipmaps - 1)
4791                                                         y = texture->mipmaps - 1;
4792                                                 triangle->mip[texunit] = y;
4793                                         }
4794                                 }
4795                         }
4796                 }
4797         
4798                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4799                 for (; y < bandy;)
4800                 {
4801                         __m128 xcoords, xslope;
4802                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4803                         int yccmask = _mm_movemask_epi8(ycc);
4804                         int edge0p, edge0n, edge1p, edge1n;
4805                         int nexty;
4806                         if (numpoints == 4)
4807                         {
4808                                 switch(yccmask)
4809                                 {
4810                                 default:
4811                                 case 0xFFFF: /*0000*/ y = endy; continue;
4812                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4813                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4814                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4815                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4816                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4817                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4818                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4819                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4820                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4821                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4822                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4823                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4824                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4825                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4826                                 case 0x0000: /*1111*/ y++; continue;
4827                                 }
4828                         }
4829                         else
4830                         {
4831                                 switch(yccmask)
4832                                 {
4833                                 default:
4834                                 case 0xFFFF: /*000*/ y = endy; continue;
4835                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4836                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4837                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4838                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4839                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4840                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4841                                 case 0x0000: /*111*/ y++; continue;
4842                                 }
4843                         }
4844                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4845                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4846                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4847                         nexty = _mm_extract_epi16(ycc, 0);
4848                         if (nexty >= bandy) nexty = bandy-1;
4849                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4850                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4851                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4852                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4853                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4854                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4855                         {
4856                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4857                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4858                         }
4859                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4860                         {
4861                                 int startx, endx, offset;
4862                                 startx = _mm_cvtss_si32(xcoords);
4863                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4864                                 if (startx < minx) 
4865                                 {
4866                                         if (startx < 0) startx = 0;
4867                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4868                                 }
4869                                 if (endx > maxx) endx = maxx;
4870                                 if (startx >= endx) continue;
4871                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4872                                 {
4873                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4874                                         span->triangle = thread->numtriangles;
4875                                         span->x = offset;
4876                                         span->y = y;
4877                                         span->startx = max(minx - offset, 0);
4878                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4879                                         if (span->startx >= span->endx)
4880                                                 continue; 
4881                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4882                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4883                                 }
4884                         }
4885                 }
4886
4887                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4888                 {
4889                         DPSOFTRAST_Draw_ProcessSpans(thread);
4890                         thread->numtriangles = 0;
4891                 }
4892         }
4893
4894         if (!ATOMIC_DECREMENT(command->refcount))
4895         {
4896                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4897                         MM_FREE(command->arrays);
4898         }
4899
4900         if (thread->numspans > 0 || thread->numtriangles > 0)
4901         {
4902                 DPSOFTRAST_Draw_ProcessSpans(thread);
4903                 thread->numtriangles = 0;
4904         }
4905 #endif
4906 }
4907
4908 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4909 {
4910         int i;
4911         int j;
4912         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4913         int datasize = 2*numvertices*sizeof(float[4]);
4914         DPSOFTRAST_Command_Draw *command;
4915         unsigned char *data;
4916         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4917         {
4918                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4919                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4920                         break;
4921                 datasize += numvertices*sizeof(float[4]);
4922         }
4923         if (element3s)
4924                 datasize += numtriangles*sizeof(unsigned short[3]);
4925         else if (element3i)
4926                 datasize += numtriangles*sizeof(int[3]);
4927         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4928         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4929         {
4930                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4931                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4932         }
4933         else
4934         {
4935                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4936                 data = (unsigned char *)command + commandsize;
4937         }
4938         command->firstvertex = firstvertex;
4939         command->numvertices = numvertices;
4940         command->numtriangles = numtriangles;
4941         command->arrays = (float *)data;
4942         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4943         dpsoftrast.firstvertex = firstvertex;
4944         dpsoftrast.numvertices = numvertices;
4945         dpsoftrast.screencoord4f = (float *)data;
4946         data += numvertices*sizeof(float[4]);
4947         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4948         data += numvertices*sizeof(float[4]);
4949         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4950         {
4951                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4952                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4953                         break;
4954                 dpsoftrast.post_array4f[j] = (float *)data;
4955                 data += numvertices*sizeof(float[4]);
4956         }
4957         command->element3i = NULL;
4958         command->element3s = NULL;
4959         if (element3s)
4960         {
4961                 command->element3s = (unsigned short *)data;
4962                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4963         }
4964         else if (element3i)
4965         {
4966                 command->element3i = (int *)data;
4967                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4968         }
4969         return command;
4970 }
4971
4972 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4973 {
4974         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4975         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4976         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4977         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4978         if (command->starty >= command->endy)
4979         {
4980                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4981                         MM_FREE(command->arrays);
4982                 DPSOFTRAST_UndoCommand(command->commandsize);
4983                 return;
4984         }
4985         command->clipped = dpsoftrast.drawclipped;
4986         command->refcount = dpsoftrast.numthreads;
4987
4988         if (dpsoftrast.usethreads)
4989         {
4990                 int i;
4991                 DPSOFTRAST_Draw_SyncCommands();
4992                 for (i = 0; i < dpsoftrast.numthreads; i++)
4993                 {
4994                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4995                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4996                                 Thread_CondSignal(thread->drawcond);
4997                 }
4998         }
4999         else
5000         {
5001                 DPSOFTRAST_Draw_FlushThreads();
5002         }
5003 }
5004  
5005 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5006 {
5007         int commandoffset = thread->commandoffset;
5008         while (commandoffset != endoffset)
5009         {
5010                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5011                 switch (command->opcode)
5012                 {
5013 #define INTERPCOMMAND(name) \
5014                 case DPSOFTRAST_OPCODE_##name : \
5015                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5016                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5017                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5018                                 commandoffset = 0; \
5019                         break;
5020                 INTERPCOMMAND(Viewport)
5021                 INTERPCOMMAND(ClearColor)
5022                 INTERPCOMMAND(ClearDepth)
5023                 INTERPCOMMAND(ColorMask)
5024                 INTERPCOMMAND(DepthTest)
5025                 INTERPCOMMAND(ScissorTest)
5026                 INTERPCOMMAND(Scissor)
5027                 INTERPCOMMAND(BlendFunc)
5028                 INTERPCOMMAND(BlendSubtract)
5029                 INTERPCOMMAND(DepthMask)
5030                 INTERPCOMMAND(DepthFunc)
5031                 INTERPCOMMAND(DepthRange)
5032                 INTERPCOMMAND(PolygonOffset)
5033                 INTERPCOMMAND(CullFace)
5034                 INTERPCOMMAND(AlphaTest)
5035                 INTERPCOMMAND(AlphaFunc)
5036                 INTERPCOMMAND(SetTexture)
5037                 INTERPCOMMAND(SetShader)
5038                 INTERPCOMMAND(Uniform4f)
5039                 INTERPCOMMAND(UniformMatrix4f)
5040                 INTERPCOMMAND(Uniform1i)
5041
5042                 case DPSOFTRAST_OPCODE_Draw:
5043                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5044                         commandoffset += command->commandsize;
5045                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5046                                 commandoffset = 0;
5047                         thread->commandoffset = commandoffset;
5048                         break;
5049
5050                 case DPSOFTRAST_OPCODE_Reset:
5051                         commandoffset = 0;
5052                         break;
5053                 }
5054         }
5055         thread->commandoffset = commandoffset;
5056 }
5057
5058 static int DPSOFTRAST_Draw_Thread(void *data)
5059 {
5060         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5061         while(thread->index >= 0)
5062         {
5063                 if (thread->commandoffset != dpsoftrast.drawcommand)
5064                 {
5065                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5066                 }
5067                 else 
5068                 {
5069                         Thread_LockMutex(thread->drawmutex);
5070                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5071                         {
5072                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5073                                 thread->starving = true;
5074                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5075                                 thread->starving = false;
5076                         }
5077                         Thread_UnlockMutex(thread->drawmutex);
5078                 }
5079         }   
5080         return 0;
5081 }
5082
5083 static void DPSOFTRAST_Draw_FlushThreads(void)
5084 {
5085         DPSOFTRAST_State_Thread *thread;
5086         int i;
5087         DPSOFTRAST_Draw_SyncCommands();
5088         if (dpsoftrast.usethreads) 
5089         {
5090                 for (i = 0; i < dpsoftrast.numthreads; i++)
5091                 {
5092                         thread = &dpsoftrast.threads[i];
5093                         if (thread->commandoffset != dpsoftrast.drawcommand)
5094                         {
5095                                 Thread_LockMutex(thread->drawmutex);
5096                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5097                                         Thread_CondSignal(thread->drawcond);
5098                                 Thread_UnlockMutex(thread->drawmutex);
5099                         }
5100                 }
5101                 for (i = 0; i < dpsoftrast.numthreads; i++)
5102                 {
5103                         thread = &dpsoftrast.threads[i];
5104                         if (thread->commandoffset != dpsoftrast.drawcommand)
5105                         {
5106                                 Thread_LockMutex(thread->drawmutex);
5107                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5108                                 {
5109                                         thread->waiting = true;
5110                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5111                                         thread->waiting = false;
5112                                 }
5113                                 Thread_UnlockMutex(thread->drawmutex);
5114                         }
5115                 }
5116         }
5117         else
5118         {
5119                 for (i = 0; i < dpsoftrast.numthreads; i++)
5120                 {
5121                         thread = &dpsoftrast.threads[i];
5122                         if (thread->commandoffset != dpsoftrast.drawcommand)
5123                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5124                 }
5125         }
5126         dpsoftrast.commandpool.usedcommands = 0;
5127 }
5128
// Public entry point that processes all queued commands; simply forwards to
// DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
5133
// Public entry point with the same effect as DPSOFTRAST_Flush: all queued
// commands are processed before it returns.
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
5138
5139 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5140 {
5141         int i;
5142         union
5143         {
5144                 int i;
5145                 unsigned char b[4];
5146         }
5147         u;
5148         u.i = 1;
5149         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5150         dpsoftrast.bigendian = u.b[3];
5151         dpsoftrast.fb_width = width;
5152         dpsoftrast.fb_height = height;
5153         dpsoftrast.fb_depthpixels = depthpixels;
5154         dpsoftrast.fb_colorpixels[0] = colorpixels;
5155         dpsoftrast.fb_colorpixels[1] = NULL;
5156         dpsoftrast.fb_colorpixels[1] = NULL;
5157         dpsoftrast.fb_colorpixels[1] = NULL;
5158         dpsoftrast.viewport[0] = 0;
5159         dpsoftrast.viewport[1] = 0;
5160         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5161         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5162         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5163         dpsoftrast.texture_firstfree = 1;
5164         dpsoftrast.texture_end = 1;
5165         dpsoftrast.texture_max = 0;
5166         dpsoftrast.color[0] = 1;
5167         dpsoftrast.color[1] = 1;
5168         dpsoftrast.color[2] = 1;
5169         dpsoftrast.color[3] = 1;
5170         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5171         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5172         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5173         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5174         for (i = 0; i < dpsoftrast.numthreads; i++)
5175         {
5176                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5177                 thread->index = i;
5178                 thread->cullface = GL_BACK;
5179                 thread->colormask[1] = 1;
5180                 thread->colormask[2] = 1;
5181                 thread->colormask[3] = 1;
5182                 thread->blendfunc[0] = GL_ONE;
5183                 thread->blendfunc[1] = GL_ZERO;
5184                 thread->depthmask = true;
5185                 thread->depthtest = true;
5186                 thread->depthfunc = GL_LEQUAL;
5187                 thread->scissortest = false;
5188                 thread->alphatest = false;
5189                 thread->alphafunc = GL_GREATER;
5190                 thread->alphavalue = 0.5f;
5191                 thread->viewport[0] = 0;
5192                 thread->viewport[1] = 0;
5193                 thread->viewport[2] = dpsoftrast.fb_width;
5194                 thread->viewport[3] = dpsoftrast.fb_height;
5195                 thread->scissor[0] = 0;
5196                 thread->scissor[1] = 0;
5197                 thread->scissor[2] = dpsoftrast.fb_width;
5198                 thread->scissor[3] = dpsoftrast.fb_height;
5199                 thread->depthrange[0] = 0;
5200                 thread->depthrange[1] = 1;
5201                 thread->polygonoffset[0] = 0;
5202                 thread->polygonoffset[1] = 0;
5203         
5204                 if (dpsoftrast.interlace)
5205                 {
5206                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5207                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5208                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5209                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5210                 }
5211                 else
5212                 {
5213                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5214                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5215                 }
5216
5217                 thread->numspans = 0;
5218                 thread->numtriangles = 0;
5219                 thread->commandoffset = 0;
5220                 thread->waiting = false;
5221                 thread->starving = false;
5222            
5223                 thread->validate = -1;
5224                 DPSOFTRAST_Validate(thread, -1);
5225  
5226                 if (dpsoftrast.usethreads)
5227                 {
5228                         thread->waitcond = Thread_CreateCond();
5229                         thread->drawcond = Thread_CreateCond();
5230                         thread->drawmutex = Thread_CreateMutex();
5231                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5232                 }
5233         }
5234         return 0;
5235 }
5236
5237 void DPSOFTRAST_Shutdown(void)
5238 {
5239         int i;
5240         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5241         {
5242                 DPSOFTRAST_State_Thread *thread;
5243                 for (i = 0; i < dpsoftrast.numthreads; i++)
5244                 {
5245                         thread = &dpsoftrast.threads[i];
5246                         Thread_LockMutex(thread->drawmutex);
5247                         thread->index = -1;
5248                         Thread_CondSignal(thread->drawcond);
5249                         Thread_UnlockMutex(thread->drawmutex);
5250                         Thread_WaitThread(thread->thread, 0);
5251                         Thread_DestroyCond(thread->waitcond);
5252                         Thread_DestroyCond(thread->drawcond);
5253                         Thread_DestroyMutex(thread->drawmutex);
5254                 }
5255         }
5256         for (i = 0;i < dpsoftrast.texture_end;i++)
5257                 if (dpsoftrast.texture[i].bytes)
5258                         MM_FREE(dpsoftrast.texture[i].bytes);
5259         if (dpsoftrast.texture)
5260                 free(dpsoftrast.texture);
5261         if (dpsoftrast.threads)
5262                 MM_FREE(dpsoftrast.threads);
5263         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5264 }
5265